From 449d21b3c6c3cca098e51fa0389de55ccb26b015 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 3 Apr 2017 20:32:50 +0300 Subject: [PATCH 001/171] [HIPIFY] Remove hipLaunchParm in HIP kernel declaration. --- hipify-clang/src/Cuda2Hip.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 6c24fbf288..27447b8d8c 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2343,9 +2343,6 @@ private: LangOptions DefaultLangOptions; SmallString<40> XStr; raw_svector_ostream OS(XStr); - StringRef initialParamList; - OS << "hipLaunchParm lp"; - size_t repLength = OS.str().size(); SourceLocation sl = kernelDecl->getNameInfo().getEndLoc(); SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(sl, tok::l_paren, *SM, DefaultLangOptions, true); DEBUG(dbgs() << kernelArgListStart.printToString(*SM)); @@ -2355,14 +2352,12 @@ private: SourceLocation kernelArgListStart(pvdFirst->getLocStart()); SourceLocation kernelArgListEnd(pvdLast->getLocEnd()); SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions); - repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); - initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength); - OS << ", " << initialParamList; + size_t repLength = SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); + OS << StringRef(SM->getCharacterData(kernelArgListStart), repLength); + Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str()); + FullSourceLoc fullSL(sl, *(Result.SourceManager)); + insertReplacement(Rep0, fullSL); } - DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n"); - Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str()); - FullSourceLoc fullSL(sl, *(Result.SourceManager)); - 
insertReplacement(Rep0, fullSL); } bool cudaCall(const MatchFinder::MatchResult &Result) { From 143e0af716b279f3a1e15b0ef1bf62103ddcd0a7 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 3 Apr 2017 22:05:01 +0300 Subject: [PATCH 002/171] [HIPIFY] GGL support + hipLaunchKernel -> hipLaunchKernelGGL + macro HIP_KERNEL_NAME is no longer used --- hipify-clang/src/Cuda2Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 27447b8d8c..383af0440c 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2426,9 +2426,9 @@ private: XStr.clear(); if (calleeName.find(',') != StringRef::npos) { SmallString<128> tmpData; - calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData); + calleeName = Twine("(" + calleeName + ")").toStringRef(tmpData); } - OS << "hipLaunchKernel(" << calleeName << ","; + OS << "hipLaunchKernelGGL(" << calleeName << ","; const CallExpr *config = launchKernel->getConfig(); DEBUG(dbgs() << "Kernel config arguments:" << "\n"); SourceManager *SM = Result.SourceManager; @@ -2468,7 +2468,7 @@ private: Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str()); FullSourceLoc fullSL(launchKernel->getLocStart(), *SM); insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernel", CONV_KERN, API_RUNTIME}; + hipCounter counter = {"hipLaunchKernelGGL", CONV_KERN, API_RUNTIME}; updateCounters(counter, refName.str()); return true; } From 958af4dae14caa91f5f3a5efee5de3d78e3c9b78 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 4 Apr 2017 08:06:09 +0530 Subject: [PATCH 003/171] hip_doc packaging script updates - Gracefully handle missing doxygen and grip tools Change-Id: I1a4a653d687c136c6d9237062ab4d02bc6cb3db1 --- packaging/hip_doc.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/packaging/hip_doc.txt b/packaging/hip_doc.txt index bbcaf54ec8..d5a0c471b1 
100644 --- a/packaging/hip_doc.txt +++ b/packaging/hip_doc.txt @@ -1,12 +1,19 @@ cmake_minimum_required(VERSION 2.8.3) project(hip_doc) -add_custom_target(build_doxygen ALL +find_program(DOXYGEN_EXE doxygen) +if(DOXYGEN_EXE) + add_custom_target(build_doxygen ALL COMMAND HIP_PATH=@hip_SOURCE_DIR@ doxygen @hip_SOURCE_DIR@/docs/doxygen-input/doxy.cfg) -add_custom_target(convert_md_to_html ALL + install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI) +endif() + +find_program(GRIP_EXE grip) +if(GRIP_EXE) + add_custom_target(convert_md_to_html ALL COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html) -install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI) -install(DIRECTORY md2html/ DESTINATION docs) + install(DIRECTORY md2html/ DESTINATION docs) +endif() ############################# # Packaging steps From 93887e9c83d883d9cb8c92cce3dc7356b9f25a56 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 4 Apr 2017 08:07:56 +0530 Subject: [PATCH 004/171] Remove unused packaging scripts Change-Id: I609391b905810eb24f7fd4ea6d7f27166ca001b3 --- packaging/create_hip_samples_installer.sh | 23 ----------------------- 1 file changed, 23 deletions(-) delete mode 100755 packaging/create_hip_samples_installer.sh diff --git a/packaging/create_hip_samples_installer.sh b/packaging/create_hip_samples_installer.sh deleted file mode 100755 index 91789d2524..0000000000 --- a/packaging/create_hip_samples_installer.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -function die { - echo "${1-Died}." >&2 - exit 1 -} - -payload=$1 -script=$2 -[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!" 
-tmp=__extract__$RANDOM - -printf "#!/bin/bash -samples_dir=\$1 -[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir -mkdir -p \$samples_dir -PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\` -tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir -echo \"HIP samples installed in \$samples_dir\" -exit 0 -__PAYLOAD_BELOW__\n" > "$tmp" - -cat "$tmp" "$payload" > "$script" && rm "$tmp" -chmod +x "$script" From fc61b793fe0e53896509f71b8c3b1c401aa7312d Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 4 Apr 2017 15:51:10 +0530 Subject: [PATCH 005/171] mgpu IPC support fix Change-Id: I12e4b2fd189c3658efd3b07defa18ece3853b0eb --- src/hip_memory.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 805fc9efc0..da5530349f 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1260,10 +1260,15 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; //Attach ipc memory - hsa_status_t hsa_status = - hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - hipStatus = hipErrorMapBufferObjectFailed; + auto ctx= ihipGetTlsDefaultCtx(); + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + hsa_status_t hsa_status = + hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + hipStatus = hipErrorMapBufferObjectFailed; + } #else hipStatus = hipErrorRuntimeOther; #endif From a98802f72c40f96a3ad23cc4902417ca17799f2c Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 5 Apr 2017 16:23:27 -0500 Subject: [PATCH 006/171] hip_debug.md update from Alex, regarding __device__ 
function restrictions Change-Id: I5e54fd97fc632d4283f76282e3935396a1aad235 --- docs/markdown/hip_bugs.md | 90 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index e15c37fc54..addf2c17f6 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -5,6 +5,7 @@ - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm) - [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) +- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) @@ -46,4 +47,91 @@ hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_H ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); -2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. \ No newline at end of file +2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. 
+ + +### HIP is more restrictive in enforcing restrictions +By the language specification, both for HIP and CUDA it is forbidden to call a +`__device__` function in a `__host__` context. In practice, you may observe +differences in the strictness of this restriction, with HIP exhibiting a tighter +adherence to the specification i.e. being less tolerant of infringing code. The +solution is to always ensure that all functions which are to be called in a +`__device__` context are correctly annotated to reflect it. An interesting case +where these differences emerge is shown below (this has been lifted from +production code, and relies on a the common [C++ Member Detector idiom][1], as it +would be implemented pre C++11): +```c++ +#include +#include + +struct meta_yes { char a[1]; }; +struct meta_no { char a[2]; }; + +// Dual restriction is necessary in HIP if the detector is to work for +// __device__ contexts as well as __host__ ones. NVCC is less strict. +template +__host__ __device__ +const T& return_ref(); + +template +struct has_nullary_operator { + // Dual restriction is necessary in HIP if the detector is to work for + // __device__ contexts as well as __host__ ones. NVCC is less strict. + template + __host__ __device__ + static + meta_yes testFunctor( + C const *, + typename std::enable_if< + (sizeof(return_ref().operator()()) > 0)>::type* = nullptr); + static + meta_no testFunctor(...); + + enum { + value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template< + typename Scalar, + typename NullaryOp, + bool has_nullary = has_nullary_operator::value> +struct nullary_wrapper { + template + T packetOp() const { return T{1}; } +}; + + +template +struct nullary_wrapper { + template + T packetOp() const { return T{10}; } +}; + +// This specialisation will fail to compile. 
+template +struct nullary_wrapper {}; + +template +struct UniformRandomGenerator; + +template<> struct UniformRandomGenerator { + float operator()() const [[hc]] { return 42.0; } +}; + +__device__ +void this_will_not_compile_if_detector_is_not_marked_device() +{ + float f = + nullary_wrapper< + float, UniformRandomGenerator>().packetOp(); +} + +__host__ +void this_will_not_compile_if_detector_is_marked_device_only() +{ + float f = + nullary_wrapper< + float, UniformRandomGenerator>().packetOp(); +} +``` +[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector \ No newline at end of file From b728637692461cdc08e7af04d06644ab79d19f94 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 17:43:20 -0500 Subject: [PATCH 007/171] Add bug descrip for "no matching constructor" --- docs/markdown/hip_bugs.md | 66 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index addf2c17f6..234dec4e0e 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -50,6 +50,70 @@ hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_H 2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. +### Errors related to `no matching constructor` + +The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct can’t be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided. 
+ +For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument. + +``` +struct Foo { + // table is an array, which makes foo + int table[3]; +}; +``` + +An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments: + +``` + +struct Foo { + int table[3]; + + // user-provided CPU serializer + // must append the contents of the array member as kernel arguments +#ifdef __HCC__ + __attribute__((annotate(“serialize”))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + for (int i = 0; i < 3; ++i) + s.Append(sizeof(int), &table[i]); + } +#endif +}; +``` + +Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example: + +``` +struct Foo { + int table[3]; + + // user-provided GPU deserializer + // table has 3 int elements, so deserializer must have 3 int parameters. +#ifdef __HCC__ + __attribute__((annotate(“user_deserialize”))) + Foo(int x0, int x1, int x2) [[cpu]][[hc]] { + table[0] = x0; + table[1] = x1; + table[2] = x2; + } +#endif + +#ifdef __HCC__ + __attribute__((annotate(“serialize”))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(int), &table[0]); + s.Append(sizeof(int), &table[1]); + s.Append(sizeof(int), &table[2]); + } +#endif +}; +``` + + +Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types. + + ### HIP is more restrictive in enforcing restrictions By the language specification, both for HIP and CUDA it is forbidden to call a `__device__` function in a `__host__` context. 
In practice, you may observe @@ -134,4 +198,4 @@ void this_will_not_compile_if_detector_is_marked_device_only() float, UniformRandomGenerator>().packetOp(); } ``` -[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector \ No newline at end of file +[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector From aaadc6026457d207db223b151ef1b2f2dceca737 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 19:40:00 -0500 Subject: [PATCH 008/171] Doc cleanup & add bug descript for restrict specifier issue --- docs/markdown/hip_bugs.md | 92 +++++++++++++++------------------------ 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index 234dec4e0e..c53b68d796 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -15,9 +15,11 @@ Some common code practices may lead to hipcc generating a error with the form : undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ To workaround, try: -- Avoid calling hcLaunchKernel from a function with the __host__ attribute +- Avoid calling hipLaunchKernel from a function with the __host__ attribute +``` __host__ MyFunc(…) { hipLaunchKernel(myKernel, …) +``` - Avoid use of static with kernel definition: static __global__ MyKernel - Avoid defining kernels in anonymous namespace @@ -25,25 +27,6 @@ namespace { __global__ MyKernel … - Avoid calling member functions -If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang. -Reason is that the hipLaunchKernel macro locks the stream. -If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs. - -To workaround, try: -Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel. 
- -``` -// Example pseudo code causing system hang: -// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition. -hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bottom[0]->gpu_data()); - -// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang. -auto bot_gpu_data = bottom[0]->gpu_data(); -hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bot_gpu_data); - -``` ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); @@ -115,87 +98,80 @@ Rather than create serializer functions, another workaround is to pass the membe ### HIP is more restrictive in enforcing restrictions -By the language specification, both for HIP and CUDA it is forbidden to call a +The language specification for HIP and CUDA forbid calling a `__device__` function in a `__host__` context. In practice, you may observe differences in the strictness of this restriction, with HIP exhibiting a tighter -adherence to the specification i.e. being less tolerant of infringing code. The -solution is to always ensure that all functions which are to be called in a +adherence to the specification and thus less tolerant of infringing code. The +solution is to ensure that all functions which are called in a `__device__` context are correctly annotated to reflect it. An interesting case -where these differences emerge is shown below (this has been lifted from -production code, and relies on a the common [C++ Member Detector idiom][1], as it -would be implemented pre C++11): +where these differences emerge is shown below. 
This relies on a the common +[C++ Member Detector idiom][1], as it would be implemented pre C++11): + ```c++ #include #include -struct meta_yes { char a[1]; }; -struct meta_no { char a[2]; }; +struct aye { bool a[1]; }; +struct nay { bool a[2]; }; // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. template __host__ __device__ -const T& return_ref(); +const T& cref_t(); template -struct has_nullary_operator { +struct Has_call_operator { // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. template __host__ __device__ static - meta_yes testFunctor( + aye test( C const *, typename std::enable_if< - (sizeof(return_ref().operator()()) > 0)>::type* = nullptr); + (sizeof(cref_t().operator()()) > 0)>::type* = nullptr); static - meta_no testFunctor(...); + nay test(...); - enum { - value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; + enum { value = sizeof(test(static_cast(0))) == sizeof(aye) }; }; -template< - typename Scalar, - typename NullaryOp, - bool has_nullary = has_nullary_operator::value> -struct nullary_wrapper { - template - T packetOp() const { return T{1}; } +template::value> +struct Wrapper { + template + V f() const { return T{1}; } }; -template -struct nullary_wrapper { - template - T packetOp() const { return T{10}; } +template +struct Wrapper { + template + V f() const { return T{10}; } }; -// This specialisation will fail to compile. -template -struct nullary_wrapper {}; +// This specialisation will yield a compile-time error, if selected. 
+template +struct Wrapper {}; template -struct UniformRandomGenerator; +struct Functor; -template<> struct UniformRandomGenerator { - float operator()() const [[hc]] { return 42.0; } +template<> struct Functor { + __device__ + float operator()() const { return 42.0f; } }; __device__ void this_will_not_compile_if_detector_is_not_marked_device() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } __host__ void this_will_not_compile_if_detector_is_marked_device_only() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } ``` [1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector From 00bf446d53b7def0433d52a99c0cef646655e3d9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 21:59:11 -0500 Subject: [PATCH 009/171] Update bug workarounds to reflect tool improvements. --- docs/markdown/hip_bugs.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index c53b68d796..14f2935f17 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -9,23 +9,23 @@ -### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm** +### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**` Some common code practices may lead to hipcc generating a error with the form : undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ -To workaround, try: -- Avoid calling hipLaunchKernel from a function with the __host__ attribute -``` -__host__ MyFunc(…) { -hipLaunchKernel(myKernel, …) -``` +Suggested workarounds: - Avoid use of static with kernel definition: +```c++ static __global__ MyKernel -- Avoid defining kernels in anonymous namespace +``` + +- Avoid defining kernels in anonymous namespace : +```c++ namespace { -__global__ MyKernel … -- Avoid 
calling member functions + __global__ MyKernel +} +``` ### What is the current limitation of HIP Generic Grid Launch method? From 1a8ea824972c99c0c78fef15e91170dc85ecf949 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 5 Apr 2017 22:25:41 -0500 Subject: [PATCH 010/171] add extra guard to grid_launch_GGL header Change-Id: I120619c08ea2d084804fcb1639efbe6c4648dde9 --- include/hip/hcc_detail/grid_launch_GGL.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 4fd7c3ff3a..1d765dfc48 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #pragma once +#if GENERIC_GRID_LAUNCH == 1 #include "concepts.hpp" #include "helpers.hpp" @@ -851,3 +852,4 @@ namespace hip_impl ##__VA_ARGS__);\ } } +#endif //GENERIC_GRID_LAUNCH From 6dff2714e95ac65c6662d390dbc57dd232627b66 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 6 Apr 2017 09:29:44 -0500 Subject: [PATCH 011/171] added __host__ for complex functions and corrected memset and memcpy test Change-Id: I9ffefb7a0025aa111a54d20d2766982df15532e7 --- include/hip/hcc_detail/hip_complex.h | 50 ++++++++++++------------- tests/src/deviceLib/hipDeviceMemcpy.cpp | 45 ++++++++++++++++------ 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h index 9ff75d381a..dd742e484c 100644 --- a/include/hip/hcc_detail/hip_complex.h +++ b/include/hip/hcc_detail/hip_complex.h @@ -177,45 +177,45 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) #endif -__device__ static inline float hipCrealf(hipFloatComplex z){ +__device__ __host__ static inline float hipCrealf(hipFloatComplex z){ return z.x; } -__device__ static inline float hipCimagf(hipFloatComplex z){ +__device__ __host__ static inline float hipCimagf(hipFloatComplex z){ return z.y; } -__device__ static inline 
hipFloatComplex make_hipFloatComplex(float a, float b){ +__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){ hipFloatComplex z; z.x = a; z.y = b; return z; } -__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ +__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ hipFloatComplex ret; ret.x = z.x; ret.y = -z.y; return ret; } -__device__ static inline float hipCsqabsf(hipFloatComplex z){ +__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){ return z.x * z.x + z.y * z.y; } -__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x + q.x, p.y + q.y); } -__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x - q.x, p.y - q.y); } -__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ float sqabs = hipCsqabsf(q); hipFloatComplex ret; ret.x = (p.x * q.x + p.y * q.y)/sqabs; @@ -223,51 +223,51 @@ __device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatCom return ret; } -__device__ static inline float hipCabsf(hipFloatComplex z){ +__device__ __host__ static inline float hipCabsf(hipFloatComplex z){ return sqrtf(hipCsqabsf(z)); } -__device__ static inline double hipCreal(hipDoubleComplex z){ +__device__ __host__ static inline double 
hipCreal(hipDoubleComplex z){ return z.x; } -__device__ static inline double hipCimag(hipDoubleComplex z){ +__device__ __host__ static inline double hipCimag(hipDoubleComplex z){ return z.y; } -__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ +__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ hipDoubleComplex z; z.x = a; z.y = b; return z; } -__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ +__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ hipDoubleComplex ret; ret.x = z.x; ret.y = z.y; return ret; } -__device__ static inline double hipCsqabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){ return z.x * z.x + z.y * z.y; } -__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x + q.x, p.y + q.y); } -__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x - q.x, p.y - q.y); } -__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ double sqabs = hipCsqabs(q); hipDoubleComplex ret; ret.x = (p.x * q.x + p.y * q.y)/sqabs; @@ -275,28 +275,28 @@ __device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleC return ret; } -__device__ static inline 
double hipCabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCabs(hipDoubleComplex z){ return sqrtf(hipCsqabs(z)); } typedef hipFloatComplex hipComplex; -__device__ static inline hipComplex make_hipComplex(float x, +__device__ __host__ static inline hipComplex make_hipComplex(float x, float y){ return make_hipFloatComplex(x, y); } -__device__ static inline hipFloatComplex hipComplexDoubleToFloat +__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat (hipDoubleComplex z){ return make_hipFloatComplex((float)z.x, (float)z.y); } -__device__ static inline hipDoubleComplex hipComplexFloatToDouble +__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble (hipFloatComplex z){ return make_hipDoubleComplex((double)z.x, (double)z.y); } -__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ +__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ float real = (p.x * q.x) + r.x; float imag = (q.x * p.y) + r.y; @@ -306,7 +306,7 @@ __device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComp return make_hipComplex(real, imag); } -__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ float real = (p.x * q.x) + r.x; float imag = (q.x * p.y) + r.y; diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp index 54fd02c0c2..3843c07bb9 100644 --- a/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -1,18 +1,29 @@ -#include +#include #include "hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include "../test_common.h" + #define LEN 1030 #define SIZE LEN << 2 -__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald) +/* HIT_START + * BUILD: %t %s 
../test_common.cpp + * RUN: %t + * HIT_END + */ + + +__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) { - memcpy(Out, In, SIZE, Vald); + int tx = hipThreadIdx_x; + memcpy(Out + tx, In + tx, SIZE/LEN); } __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { - memset(ptr, val, size); + int tx = hipThreadIdx_x; + memset(ptr + tx, val, size); } int main() @@ -24,19 +35,29 @@ int main() Val = new uint32_t; *Val = 0; for(int i=0;i Date: Thu, 6 Apr 2017 10:48:11 -0500 Subject: [PATCH 012/171] GGL update, add while 0 guard for hipLaunchKernel API Change-Id: Ie48ef8ca2ab5e26a51febfcd92417902c33fbf66 --- include/hip/hcc_detail/grid_launch_GGL.hpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 1d765dfc48..8f1abbb70b 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -841,15 +841,16 @@ namespace hip_impl group_mem_bytes,\ stream,\ ...)\ - {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) + } #endif //GENERIC_GRID_LAUNCH From 0a07382d925e8542e4cb309b9cbcb2abe8565711 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 6 Apr 2017 16:43:26 -0500 Subject: [PATCH 013/171] fixed header structure for complex data types Change-Id: I16bf19005d933f42e8c8603c5d0b2df8ea3ad04f --- include/hip/hcc_detail/hip_complex.h | 3 +-- include/hip/hcc_detail/hip_fp16.h | 2 +- include/hip/hcc_detail/hip_vector_types.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h index dd742e484c..26d73a21a8 100644 --- 
a/include/hip/hcc_detail/hip_complex.h +++ b/include/hip/hcc_detail/hip_complex.h @@ -23,8 +23,7 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H -#include "./hip_fp16.h" -#include "./hip_vector_types.h" +#include "hip/hcc_detail/hip_vector_types.h" #if __cplusplus #define COMPLEX_ADD_OP_OVERLOAD(type) \ diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index febc1b4fce..0a861b64af 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -23,7 +23,7 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H -#include "hip/hip_runtime.h" +#include "hip/hcc_detail/hip_vector_types.h" #if __clang_major__ > 3 diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 42e1d6663c..82bd3b2d6f 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -32,7 +32,7 @@ THE SOFTWARE. #error("This version of HIP requires a newer version of HCC."); #endif -#include "host_defines.h" +#include "hip/hcc_detail/host_defines.h" #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ From 599596555884903bc7670d885c2d34349190501f Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 6 Apr 2017 23:54:00 +0000 Subject: [PATCH 014/171] Improve documentation for hipModuleLaunch functions. Change-Id: I0e22621e499775740c3301347b7416d5f98c2414 --- include/hip/hcc_detail/hip_hcc.h | 44 +++++++++++++++++++++++- include/hip/hcc_detail/hip_runtime_api.h | 23 ++++++------- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/include/hip/hcc_detail/hip_hcc.h b/include/hip/hcc_detail/hip_hcc.h index 645e980376..889e04eb9f 100644 --- a/include/hip/hcc_detail/hip_hcc.h +++ b/include/hip/hcc_detail/hip_hcc.h @@ -28,6 +28,17 @@ THE SOFTWARE. 
#if __cplusplus #ifdef __HCC__ #include + + +/** + *------------------------------------------------------------------------------------------------- + *------------------------------------------------------------------------------------------------- + * @defgroup HCC-specific features + * @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path. + * @{ + */ + + /** * @brief Return hc::accelerator associated with the specified deviceId * @return #hipSuccess, #hipErrorInvalidDevice @@ -45,6 +56,30 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a #endif // #ifdef __HCC__ +/** + * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra + * + * @param [in[ f Kernel to launch. + * @param [in] gridDimX X grid dimension specified in work-items + * @param [in] gridDimY Y grid dimension specified in work-items + * @param [in] gridDimZ Z grid dimension specified in work-items + * @param [in] blockDimX X block dimensions specified in work-items + * @param [in] blockDimY Y grid dimension specified in work-items + * @param [in] blockDimZ Z grid dimension specified in work-items + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] kernelParams + * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. + * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. 
+ * @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + + * If startNanos or stopNanos is specified, this API will record and return the start and stop timestamps for the command. The timestamps are collected on the GPU device + * and converted into ns resolution. Typically programs will specify both pointers. Collecting performance timestamps may have a small overhead (approx 1us). + * + * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. + */ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, @@ -55,8 +90,15 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, - void **extra); + void **extra, + uint64_t *startNanos=nullptr, + uint64_t *stopNanos=nullptr + ); +// doxygen end HCC-specific features +/** + * @} + */ #endif // #if __cplusplus #endif // diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 0daca7a53b..f9bfb5a310 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1913,19 +1913,18 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra * - * @param [in[ f - * @param [in] gridDimX - * @param [in] gridDimY - * @param [in] gridDimZ - * @param [in] blockDimX - * @param [in] blockDimY - * @param [in] blockDimZ - * @param [in] sharedMemBytes - * @param [in] stream - * @param [in] kernelParams - * @param [in] extraa + * @param [in[ f Kernel to launch. 
+ * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. + * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. + * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. + * @param [in] blockDimX X block dimensions specified in work-items + * @param [in] blockDimY Y grid dimension specified in work-items + * @param [in] blockDimZ Z grid dimension specified in work-items + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] kernelParams + * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * - * The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel. * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue * * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. From cfa3155082b6ab3eb91b78cbed3e269e9a846ce4 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 6 Apr 2017 23:55:15 +0000 Subject: [PATCH 015/171] Refactor events and add initial event option for hipHccModuleLaunchKernel - Change hipEvent_t to a class. - Move event logic inside the class. - Add _type to support Independent, StartCommand, StopCommand events. StartCommand returns start timestamp from events. 
Change-Id: I4ddd694f2645a3ff7170c9111dc1d3e39931ca21 --- src/hip_event.cpp | 60 ++++++++++++++++++++++++++++++++++++------ src/hip_hcc.cpp | 17 ------------ src/hip_hcc_internal.h | 31 ++++++++++++++++------ src/hip_module.cpp | 32 ++++++++++++++++------ 4 files changed, 99 insertions(+), 41 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index d44f201db5..61ac5cd3ab 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -30,6 +30,54 @@ THE SOFTWARE. //--- +ihipEvent_t::ihipEvent_t(unsigned flags) +{ + _state = hipEventStatusCreated; + _stream = NULL; + _flags = flags; + _timestamp = 0; + _type = hipEventTypeIndependent; +}; + + + +// Attach to an existing completion future: +void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType) +{ + _state = hipEventStatusRecording; + _marker = *cf; + _type = eventType; +} + + + +void ihipEvent_t::setTimestamp() +{ + if (_state == hipEventStatusRecorded) { + // already recorded, done: + return; + } else { + // TODO - use completion-future functions to obtain ticks and timestamps: + hsa_signal_t *sig = static_cast (_marker.get_native_handle()); + if (sig) { + if (hsa_signal_load_acquire(*sig) == 0) { + + if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { + _timestamp = _marker.get_end_tick(); + } else if (_type == hipEventTypeStartCommand) { + _timestamp = _marker.get_begin_tick(); + } else { + assert(0); // TODO - move to debug assert + _timestamp = 0; + } + + _state = hipEventStatusRecorded; + } + } + } +} + + hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) { hipError_t e = hipSuccess; @@ -37,12 +85,8 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) // TODO-IPC - support hipEventInterprocess. 
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming; if ((flags & ~supportedFlags) == 0) { - ihipEvent_t *eh = new ihipEvent_t(); + ihipEvent_t *eh = new ihipEvent_t(flags); - eh->_state = hipEventStatusCreated; - eh->_stream = NULL; - eh->_flags = flags; - eh->_timestamp = 0; *event = eh; } else { e = hipErrorInvalidValue; @@ -141,8 +185,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) ihipEvent_t *start_eh = start; ihipEvent_t *stop_eh = stop; - ihipSetTs(start); - ihipSetTs(stop); + start->setTimestamp(); + stop->setTimestamp(); hipError_t status = hipSuccess; *ms = 0.0f; @@ -151,7 +195,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) { // Common case, we have good information for both events. - int64_t tickDiff = (stop_eh->_timestamp - start_eh->_timestamp); + int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp()); uint64_t freqHz; hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 374840f91f..35a3e11e71 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1641,23 +1641,6 @@ const char *ihipErrorString(hipError_t hip_error) }; -void ihipSetTs(hipEvent_t e) -{ - ihipEvent_t *eh = e; - if (eh->_state == hipEventStatusRecorded) { - // already recorded, done: - return; - } else { - // TODO - use completion-future functions to obtain ticks and timestamps: - hsa_signal_t *sig = static_cast (eh->_marker.get_native_handle()); - if (sig) { - if (hsa_signal_load_acquire(*sig) == 0) { - eh->_timestamp = eh->_marker.get_end_tick(); - eh->_state = hipEventStatusRecorded; - } - } - } -} // Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx. 
diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 4b960e2820..459ea3ba2c 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -584,22 +584,39 @@ private: // Data //---- // Internal event structure: enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. - hipEventStatusCreated = 1, - hipEventStatusRecording = 2, // event has been enqueued to record something. - hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. + hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. + hipEventStatusCreated = 1, + hipEventStatusRecording = 2, // event has been enqueued to record something. + hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. } ; +// TODO - rename to ihip type of some kind +enum ihipEventType_t { + hipEventTypeIndependent, + hipEventTypeStartCommand, + hipEventTypeStopCommand, +}; // internal hip event structure. -struct ihipEvent_t { - hipEventStatus_t _state; +class ihipEvent_t { +public: + ihipEvent_t(unsigned flags); + void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); + void setTimestamp(); + uint64_t timestamp() const { return _timestamp; } ; + +public: + hipEventStatus_t _state; hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams. unsigned _flags; hc::completion_future _marker; + +private: + ihipEventType_t _type; uint64_t _timestamp; // store timestamp, may be set on host or by marker. 
+friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream); } ; @@ -822,8 +839,6 @@ extern hipError_t ihipDeviceSetState(); extern ihipDevice_t *ihipGetDevice(int); ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex); -extern void ihipSetTs(hipEvent_t e); - hipStream_t ihipSyncAndResolveStream(hipStream_t); diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 67bba5f935..c8555672c3 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -364,10 +364,11 @@ hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, hipError_t ihipModuleLaunchKernel(hipFunction_t f, - uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, - size_t sharedMemBytes, hipStream_t hStream, - void **kernelParams, void **extra) + uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, + size_t sharedMemBytes, hipStream_t hStream, + void **kernelParams, void **extra, + hipEvent_t *startEvent, hipEvent_t *stopEvent) { auto ctx = ihipGetTlsDefaultCtx(); @@ -446,7 +447,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); }; - lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, nullptr/*completion_future*/); + + hc::completion_future cf; + + lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, + (startEvent || stopEvent) ? 
&cf : nullptr); + + + if (startEvent) { + (*startEvent)->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + } + if (stopEvent) { + (*stopEvent)->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + } + if(kernelParams != NULL){ free(config[1]); @@ -470,7 +484,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, return ihipLogStatus(ihipModuleLaunchKernel(f, blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ, blockDimX, blockDimY, blockDimZ, - sharedMemBytes, hStream, kernelParams, extra)); + sharedMemBytes, hStream, kernelParams, extra, + nullptr, nullptr)); } @@ -478,7 +493,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, - void **kernelParams, void **extra) + void **kernelParams, void **extra, + hipEvent_t *startEvent, hipEvent_t *stopEvent) { HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, @@ -486,7 +502,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, kernelParams, extra); return ihipLogStatus(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, - sharedMemBytes, hStream, kernelParams, extra)); + sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, From 6656d33f75bbc13c48e6e24b02ec43dc43af1fac Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 6 Apr 2017 21:00:00 -0500 Subject: [PATCH 016/171] Finish adding start/stop event support to hipHccModuleLaunchKernel. 
Change interface to use hipEvent_t rather than hipEvent_t* Change-Id: I259062dc087a13d51dc27f84e1e8861f332a104d --- include/hip/hcc_detail/hip_hcc.h | 11 +++++------ src/hip_hcc_internal.h | 1 + src/hip_module.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/hip/hcc_detail/hip_hcc.h b/include/hip/hcc_detail/hip_hcc.h index 889e04eb9f..fc04917931 100644 --- a/include/hip/hcc_detail/hip_hcc.h +++ b/include/hip/hcc_detail/hip_hcc.h @@ -70,15 +70,14 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. - * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. + * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. * @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API. * * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - - * If startNanos or stopNanos is specified, this API will record and return the start and stop timestamps for the command. The timestamps are collected on the GPU device - * and converted into ns resolution. Typically programs will specify both pointers. Collecting performance timestamps may have a small overhead (approx 1us). * * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. 
+ + * HIP/ROCm actually updates the start event when the associated kernel completes. */ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, @@ -91,8 +90,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, hipStream_t hStream, void **kernelParams, void **extra, - uint64_t *startNanos=nullptr, - uint64_t *stopNanos=nullptr + hipEvent_t startEvent=nullptr, + hipEvent_t stopEvent=nullptr ); // doxygen end HCC-specific features diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 459ea3ba2c..9c17c6e98c 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -604,6 +604,7 @@ public: void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); void setTimestamp(); uint64_t timestamp() const { return _timestamp; } ; + ihipEventType_t type() const { return _type; }; public: hipEventStatus_t _state; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index c8555672c3..b359e7a63c 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -368,7 +368,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, - hipEvent_t *startEvent, hipEvent_t *stopEvent) + hipEvent_t startEvent, hipEvent_t stopEvent) { auto ctx = ihipGetTlsDefaultCtx(); @@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if (startEvent) { - (*startEvent)->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand); } if (stopEvent) { - (*stopEvent)->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand); } @@ -494,7 +494,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void 
**kernelParams, void **extra, - hipEvent_t *startEvent, hipEvent_t *stopEvent) + hipEvent_t startEvent, hipEvent_t stopEvent) { HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, From 6e565d8469e9fe4a3da95ee60155dfe5ca85e95b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 14:51:54 +0530 Subject: [PATCH 017/171] Fix build issues in hipCommander sample - Remove -stdlib=libstdc++ from Makefile - Removed deleted HIP header file fom includes Change-Id: Ia189396bee19fc52b679259df56c6c6e2bafb6fe --- samples/1_Utils/hipCommander/Makefile | 3 --- samples/1_Utils/hipCommander/hipCommander.cpp | 1 - 2 files changed, 4 deletions(-) diff --git a/samples/1_Utils/hipCommander/Makefile b/samples/1_Utils/hipCommander/Makefile index e770c636a4..a411763b7f 100644 --- a/samples/1_Utils/hipCommander/Makefile +++ b/samples/1_Utils/hipCommander/Makefile @@ -10,9 +10,6 @@ OPT=-O3 CXXFLAGS = $(OPT) --std=c++11 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) -ifeq (${HIP_PLATFORM}, hcc) - CXXFLAGS += " -stdlib=libc++" -endif CODE_OBJECTS=nullkernel.hsaco diff --git a/samples/1_Utils/hipCommander/hipCommander.cpp b/samples/1_Utils/hipCommander/hipCommander.cpp index 0add1ce3e3..4b93180b18 100644 --- a/samples/1_Utils/hipCommander/hipCommander.cpp +++ b/samples/1_Utils/hipCommander/hipCommander.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #endif #include From 935e3cd64975274f322d0b5c7064e6b1fc6f49b9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:24:10 +0530 Subject: [PATCH 018/171] Fix build issues with bit_extract sample Change-Id: I628b3c83a16f7adf0ab8ca60aecde8c073c34fd9 --- samples/0_Intro/bit_extract/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/0_Intro/bit_extract/Makefile b/samples/0_Intro/bit_extract/Makefile index 78f6a2faa8..08bca6e642 100644 --- a/samples/0_Intro/bit_extract/Makefile +++ b/samples/0_Intro/bit_extract/Makefile @@ 
-11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif -ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = -stdlib=libc++ -endif - EXE=bit_extract From 16215ea9b2718f3d2a98f5d208a4906b0a878017 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:38:56 +0530 Subject: [PATCH 019/171] Updated table of contents in markdown documentation Change-Id: I7347a06f57f9927ca3fcc5590a6c8200bc1bb1f5 --- docs/markdown/hip_bugs.md | 6 ++-- docs/markdown/hip_faq.md | 9 +++--- docs/markdown/hip_kernel_language.md | 1 + docs/markdown/hip_porting_guide.md | 9 ++++-- docs/markdown/hip_profiling.md | 46 ++++++++++++++++------------ 5 files changed, 41 insertions(+), 30 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index 14f2935f17..73133843bc 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -1,10 +1,10 @@ -# HIP Bugs +# HIP Bugs -- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm) -- [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call) +- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) +- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor) - [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index 8ccb458103..e316d449ef 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -4,7 +4,7 @@ - [What APIs and features does HIP 
support?](#what-apis-and-features-does-hip-support) - [What is not supported?](#what-is-not-supported) - * [Run-time features](#run-time-features) + * [Runtime/Driver API features](#runtimedriver-api-features) * [Kernel language features](#kernel-language-features) - [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda) - [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support) @@ -23,10 +23,11 @@ - [On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?](#on-hcc-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-) - [HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hcc-vs-nvcc-incorrectly---what-should-i-do) - [Can I install both CUDA SDK and HCC on same machine?](#can-i-install-both-cuda-sdk-and-hcc-on-same-machine) +- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code) +- [On HCC, can I use HC functionality with HIP?](#on-hcc-can-i-use-hc-functionality-with-hip) - [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow) - * [Using CodeXL markers for HIP Functions](#using-codexl-markers-for-hip-functions) - * [Using HIP_TRACE_API](#using-hip_trace_api) -- [How do I enable HIP Generic Grid Launch option?](#how-do-i-enable-hip-generic-grid-launch-option) +- [What if HIP generates error of "symbol multiply defined!" 
only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine) +- [How do I disable HIP Generic Grid Launch option?](#how-do-i-disable-hip-generic-grid-launch-option) diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 0c7f3c8d25..3cb7b17a0c 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -44,6 +44,7 @@ - [Pragma Unroll](#pragma-unroll) - [In-Line Assembly](#in-line-assembly) - [C++ Support](#c-support) +- [Kernel Compilation](#kernel-compilation) diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md index 9f20d12423..72f6384f6d 100644 --- a/docs/markdown/hip_porting_guide.md +++ b/docs/markdown/hip_porting_guide.md @@ -21,6 +21,7 @@ and provides practical suggestions on how to port CUDA code and work through com * [Device-Architecture Properties](#device-architecture-properties) * [Table of Architecture Properties](#table-of-architecture-properties) - [Finding HIP](#finding-hip) +- [hipLaunchKernel](#hiplaunchkernel) - [Compiler Options](#compiler-options) - [Linking Issues](#linking-issues) * [Linking With hipcc](#linking-with-hipcc) @@ -31,9 +32,11 @@ and provides practical suggestions on how to port CUDA code and work through com * [Using a Standard C++ Compiler](#using-a-standard-c-compiler) + [cuda.h](#cudah) * [Choosing HIP File Extensions](#choosing-hip-file-extensions) - * [Workarounds](#workarounds) - + [warpSize](#warpsize) - + [Textures and Cache Control](#textures-and-cache-control) +- [Workarounds](#workarounds) + * [warpSize](#warpsize) +- [memcpyToSymbol](#memcpytosymbol) +- [threadfence_system](#threadfence_system) + * [Textures and Cache Control](#textures-and-cache-control) - [More Tips](#more-tips) * [HIPTRACE Mode](#hiptrace-mode) * [Environment Variables](#environment-variables) diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index 463c9c13b3..6e5cde700d 
100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -4,26 +4,32 @@ This section describes the profiling and debugging capabilities that HIP provide Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs. This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features. - * [CodeXL Profiling](#codexl-profiling) - * [Collecting and Viewing Traces](#collecting-and-viewing-traces) - * [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling) - * [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection) - * [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results) - * [More information on CodeXL](#more-information-on-codexl) - * [HIP Markers](#hip-markers) - * [Profiling HIP APIs](#profiling-hip-apis) - * [Adding markers to applications](#adding-markers-to-applications) - * [Additional HIP Profiling Features](#additional-hip-profiling-features) - * [Demangling C Kernel Names](#demangling-c-kernel-names) - * [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends) - * [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size) - * [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time) - * [Tracing and Debug](#tracing-and-debug) - * [Tracing HIP APIs](#tracing-hip-apis) - * [Color](#color) - * [Using HIP_DB](#using-hip_db) - * [Using ltrace](#using-ltrace) - * [Chicken bits](#chicken-bits) + + +- [CodeXL Profiling](#codexl-profiling) + * [Collecting and Viewing Traces](#collecting-and-viewing-traces) + + [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling) + + [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection) + + [Using CodeXL 
to view profiling results:](#using-codexl-to-view-profiling-results) + + [More information on CodeXL](#more-information-on-codexl) + * [HIP Markers](#hip-markers) + + [Profiling HIP APIs](#profiling-hip-apis) + + [Adding markers to applications](#adding-markers-to-applications) + * [Additional HIP Profiling Features](#additional-hip-profiling-features) + + [Demangling C++ Kernel Names](#demangling-c-kernel-names) + + [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends) + + [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size) + + [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time) +- [Tracing and Debug](#tracing-and-debug) + * [Tracing HIP APIs](#tracing-hip-apis) + + [Color](#color) + * [Using HIP_DB](#using-hip_db) + * [Using ltrace](#using-ltrace) + * [Chicken bits](#chicken-bits) + * [Debugging HIP Applications](#debugging-hip-applications) + * [General Debugging Tips](#general-debugging-tips) + + ## CodeXL Profiling From 6d4af1ab1fa0c5dfcdd2650c699543f43656825b Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 14:06:31 -0500 Subject: [PATCH 020/171] update GGL to log launched kernel information Change-Id: Ied0aa6055673c687071b4a579aecd17f0f3f09ce --- include/hip/hcc_detail/grid_launch_GGL.hpp | 33 +++-- include/hip/hcc_detail/helpers.hpp | 150 ++++++++++----------- src/grid_launch.cpp | 36 +++++ 3 files changed, 126 insertions(+), 93 deletions(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 8f1abbb70b..2dd9a95bc6 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. 
*/ #pragma once + #if GENERIC_GRID_LAUNCH == 1 #include "concepts.hpp" @@ -71,7 +72,7 @@ namespace hip_impl template using is_new_grid_launch_t = typename std::conditional< - std::is_callable{}, + is_callable{}, New_grid_launch_tag, Old_grid_launch_tag>::type; } @@ -118,6 +119,7 @@ namespace hip_impl // TODO: these are workarounds, they should be removed. hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); + void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); void unlock_stream_hip_( hipStream_t, void*, const char*, hc::accelerator_view*); @@ -137,7 +139,13 @@ namespace hip_impl void* lck_stream = nullptr; auto acc_v = lock_stream_hip_(stream, lck_stream); auto stream_guard = make_RAII_guard( - [](){ /* perhaps use a slimmed down ihipPrintKernelLaunch here */ }, + std::bind( + print_prelaunch_trace_, + kernel_name, + num_blocks, + dim_blocks, + group_mem_bytes, + stream), std::bind( unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); @@ -841,16 +849,15 @@ namespace hip_impl group_mem_bytes,\ stream,\ ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) - + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) } #endif //GENERIC_GRID_LAUNCH diff --git a/include/hip/hcc_detail/helpers.hpp b/include/hip/hcc_detail/helpers.hpp index e5a84a4678..611929766b 100644 --- a/include/hip/hcc_detail/helpers.hpp +++ b/include/hip/hcc_detail/helpers.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #pragma once +#include "concepts.hpp" #include // For std::conditional, std::decay, std::enable_if, // std::false_type, std result_of and std::true_type. @@ -29,9 +30,6 @@ THE SOFTWARE. namespace std { // TODO: these should be removed as soon as possible. 
#if (__cplusplus < 201406L) - template - using void_t = void; - #if (__cplusplus < 201402L) template using enable_if_t = typename enable_if::type; @@ -43,88 +41,80 @@ namespace std using result_of_t = typename result_of::type; template using remove_reference_t = typename remove_reference::type; - template< - FunctionalProcedure F, - unsigned int n = 0u, - typename = void> - struct is_callable_impl : is_callable_impl {}; - - // Pointer to member function, call through non-pointer. - template - struct is_callable_impl< - F(C, Ts...), - 0u, - void_t().*declval())(declval()...))> - > : true_type { - }; - - // Pointer to member function, call through pointer. - template - struct is_callable_impl< - F(C, Ts...), - 1u, - void_t()).*declval())(declval()...))> - > : std::true_type { - }; - - // Pointer to member data, call through non-pointer, no args. - template - struct is_callable_impl< - F(C), - 2u, - void_t().*declval())> - > : true_type { - }; - - // Pointer to member data, call through pointer, no args. - template - struct is_callable_impl< - F(C), - 3u, - void_t().*declval())> - > : true_type { - }; - - // General call, n args. - template - struct is_callable_impl< - F(Ts...), - 4u, - void_t()(declval()...))> - > : true_type { - }; - - // Not callable. - template - struct is_callable_impl : false_type {}; - - template - struct is_callable : is_callable_impl {}; - #else - template - struct is_callable_impl : false_type {}; - - template - struct is_callable_impl< - F(Ts...), - void_t>> : true_type {}; - - template - struct is_callable : is_callable_impl {}; #endif - template - struct disjunction : false_type {}; - template - struct disjunction : B1 {}; - template - struct disjunction - : conditional_t> - {}; #endif } -namespace hip_impl // Only for documentation, macros ignore namespaces. 
+namespace hip_impl { + template + using void_t_ = void; + + #if (__cplusplus < 201402L) + template< + FunctionalProcedure F, + unsigned int n = 0u, + typename = void> + struct is_callable_impl : is_callable_impl {}; + + // Pointer to member function, call through non-pointer. + template + struct is_callable_impl< + F(C, Ts...), + 0u, + void_t_().*std::declval())( + std::declval()...))> + > : std::true_type {}; + + // Pointer to member function, call through pointer. + template + struct is_callable_impl< + F(C, Ts...), + 1u, + void_t_()).*std::declval())( + std::declval()...))> + > : std::true_type {}; + + // Pointer to member data, call through non-pointer, no args. + template + struct is_callable_impl< + F(C), + 2u, + void_t_().*std::declval())> + > : std::true_type {}; + + // Pointer to member data, call through pointer, no args. + template + struct is_callable_impl< + F(C), + 3u, + void_t_().*std::declval())> + > : std::true_type {}; + + // General call, n args. + template + struct is_callable_impl< + F(Ts...), + 4u, + void_t_()(std::declval()...))> + > : std::true_type {}; + + // Not callable. + template + struct is_callable_impl : std::false_type {}; + + template + struct is_callable : is_callable_impl {}; + #else + template + struct is_callable_impl : std::false_type {}; + + template + struct is_callable_impl< + F(Ts...), + void_t_>> : std::true_type {}; + #endif + #define count_macro_args_impl_hip_(\ _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\ _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29,\ diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index 7739995600..cac01df7dc 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -27,6 +27,9 @@ THE SOFTWARE. 
#include "hc.hpp" #include "trace_helper.h" +#include +#include + namespace hip_impl { hc::accelerator_view lock_stream_hip_( @@ -42,6 +45,39 @@ namespace hip_impl return (*static_cast(locked_stream))->_av; } + void print_prelaunch_trace_( + const char* kernel_name, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream) + { + if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + HIP_PROFILE_API || + (COMPILE_HIP_DB && HIP_TRACE_API)) { + std::stringstream os; + os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum() + << " hipLaunchKernel '" << kernel_name << "'" + << " gridDim:" << num_blocks + << " groupDim:" << dim_blocks + << " sharedMem:+" << group_mem_bytes + << " " << *stream; + + if (HIP_PROFILE_API == 0x1) { + std::string shortAtpString("hipLaunchKernel:"); + shortAtpString += kernel_name; + MARKER_BEGIN(shortAtpString.c_str(), "HIP"); + } else if (HIP_PROFILE_API == 0x2) { + MARKER_BEGIN(os.str().c_str(), "HIP"); + } + + if (COMPILE_HIP_DB && HIP_TRACE_API) { + std::cerr << API_COLOR << os.str() << API_COLOR_END + << std::endl; + } + } + } + void unlock_stream_hip_( hipStream_t stream, void* locked_stream, From a5ca430e5cf0cae4dbb28a2f2f389aa993a847c2 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 14:17:41 -0500 Subject: [PATCH 021/171] Add more operator overloading for float2 type, contributed by Aditya Change-Id: If1ab7fb24d64bb5304142aed0951c9bd5ad47d20 --- include/hip/hcc_detail/hip_vector_types.h | 38 ++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 82bd3b2d6f..35c6c23548 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -1270,6 +1270,15 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_1VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return 
lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ } #define DECLOP_1VAR_1IN_1OUT(type, op) \ @@ -1338,6 +1347,15 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_2VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ } #define DECLOP_2VAR_1IN_1OUT(type, op) \ @@ -1415,7 +1433,16 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_3VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ #define DECLOP_3VAR_1IN_1OUT(type, op) \ __device__ __host__ static inline type operator 
op(type &rhs) { \ @@ -1500,6 +1527,15 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_4VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ } #define DECLOP_4VAR_1IN_1OUT(type, op) \ From a9fd0d4e0df599ede9708f2272894e9b1b5c68da Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 16:29:25 -0500 Subject: [PATCH 022/171] Update the define of __global__ for GGL Change-Id: I563bb2a132403bcbe9e9f279b55406cf0255af7d --- include/hip/hcc_detail/host_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index b0a7421d18..5864cfa0e7 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -48,7 +48,7 @@ THE SOFTWARE. 
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else //#warning "GGL global define reached" -#define __global__ [[hc]] __attribute__((weak)) +#define __global__ __attribute__((hc, weak)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From 2848d7a22289841e3c4fc235d481832ac515e23f Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 10 Apr 2017 08:53:12 -0500 Subject: [PATCH 023/171] add math.h to cover sqrtf function Change-Id: Ia37752710cea4ca77e0a4e61f8e69a0355d9488d --- include/hip/hcc_detail/hip_complex.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h index 26d73a21a8..c76d65b058 100644 --- a/include/hip/hcc_detail/hip_complex.h +++ b/include/hip/hcc_detail/hip_complex.h @@ -24,6 +24,7 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H #include "hip/hcc_detail/hip_vector_types.h" +#include #if __cplusplus #define COMPLEX_ADD_OP_OVERLOAD(type) \ From 71447dacad2accae408d6809591316085b79122b Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 10 Apr 2017 11:17:05 -0500 Subject: [PATCH 024/171] Fix ifndef guard in hip_fp16.h Change-Id: I0215556e7aa98a74e8a984e4de3fb6e8cafdfb24 --- include/hip/hip_fp16.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/hip/hip_fp16.h b/include/hip/hip_fp16.h index 0e002d9396..95879dba50 100644 --- a/include/hip/hip_fp16.h +++ b/include/hip/hip_fp16.h @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ -#ifdef HIP_INCLUDE_HIP_HIP_FP16_H +#ifndef HIP_INCLUDE_HIP_HIP_FP16_H #define HIP_INCLUDE_HIP_HIP_FP16_H #include From 310c130fc6432de01e1e20569e994467c4e0436b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 9 Apr 2017 20:51:56 -0500 Subject: [PATCH 025/171] Doc update for Serialization. 
Describe workaround for partial specialization --- docs/markdown/hip_bugs.md | 81 +++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index 73133843bc..9452fae2fd 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -1,5 +1,4 @@ -# HIP Bugs - +# HIP Bugs - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) @@ -41,60 +40,86 @@ For example, `Foo` in the code snippets below contains an array-typed member var ``` struct Foo { + float _data; // table is an array, which makes foo int table[3]; }; ``` -An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments: +A workaround is to provide a custom serializer on the host side which appends the contents of the array as kernel arguments, and a custom deserializer on the device path to reconstruct the array inside the GPU kernels. +The deserializer cannot be a function template, and should have a number of scalar-typed parameters equal to the length of the array-typed member variable. For example: ``` struct Foo { - int table[3]; + float _data; + int _table[3]; + - // user-provided CPU serializer - // must append the contents of the array member as kernel arguments #ifdef __HCC__ + // user-provided CPU serializer + // Append the contents of the array member as kernel arguments __attribute__((annotate(“serialize”))) void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(float), &_data); for (int i = 0; i < 3; ++i) - s.Append(sizeof(int), &table[i]); + s.Append(sizeof(int), &_table[i]); } -#endif -}; -``` -Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. 
Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example: - -``` -struct Foo { - int table[3]; // user-provided GPU deserializer // table has 3 int elements, so deserializer must have 3 int parameters. -#ifdef __HCC__ __attribute__((annotate(“user_deserialize”))) - Foo(int x0, int x1, int x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; + Foo(float d, int x0, int x1, int x2) [[cpu]][[hc]] { + _data = d; + _table[0] = x0; + _table[1] = x1; + _table[2] = x2; } -#endif -#ifdef __HCC__ - __attribute__((annotate(“serialize”))) - void __cxxamp_serialize(Kalmar::Serialize &s) const { - s.Append(sizeof(int), &table[0]); - s.Append(sizeof(int), &table[1]); - s.Append(sizeof(int), &table[2]); - } #endif }; ``` Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types. +Note that a class or struct can contain only one "user_deserialize" constructor. +For types which contain arrays whose size depends on a template parameter, you can use partial template specialization to implement one constructor per specialization. +However, an easier approach may be to create one "user_deserialize" constructor which processes the maximum supported dimension. +This will take more memory in the structure and also require additional kernel arguments, but this may have little performance impact and the conversion is easier than partial template specialization. An example: + +``` +#define MAX_Dim 4 +template <typename T, int Dim> struct MyArray { + + T* dataPtr_; + //int size_[Dim]; // Original code with template-sized Dims + int size_[MAX_Dim]; // Workaround code - allocate an array big enough for all dims so one serializer works. + + +... 
+ +#ifdef __HCC__ + __attribute__((annotate("serialize"))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(float), &_dataPtr); + for (int i=0; i Date: Tue, 11 Apr 2017 01:16:28 +0000 Subject: [PATCH 026/171] Add integer abs (initial implementation, can be optimized with OCML) Change-Id: I1f568c8c0e2333af1fda4c313dc48ea0c5b6ab00 --- include/hip/hcc_detail/math_functions.h | 1 + src/math_functions.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/include/hip/hcc_detail/math_functions.h b/include/hip/hcc_detail/math_functions.h index c3b8186fd3..9faff2743a 100644 --- a/include/hip/hcc_detail/math_functions.h +++ b/include/hip/hcc_detail/math_functions.h @@ -51,6 +51,7 @@ __device__ float exp10f(float x); __device__ float exp2f(float x); __device__ float expf(float x); __device__ float expm1f(float x); +__device__ int abs(int x); __device__ float fabsf(float x); __device__ float fdimf(float x, float y); __device__ float fdividef(float x, float y); diff --git a/src/math_functions.cpp b/src/math_functions.cpp index 92cc8689fc..3472216309 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -114,6 +114,10 @@ __device__ float expm1f(float x) { return hc::precise_math::expm1f(x); } +__device__ int abs(int x) +{ + return x >= 0 ? x : -x; // TODO - optimize with OCML +} __device__ float fabsf(float x) { return hc::precise_math::fabsf(x); From 7bb378bd4d2e7160887d3c940f8dc6b59397b5aa Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 10:34:33 +0530 Subject: [PATCH 027/171] Add hip-config.cmake to hip_hcc package Due to the way hip packages are generated, for the purpose of packaging hip-targets*.cmake are not generated at build time. However hip-config*.cmake are generated at build time. This will be fixed in future. 
Change-Id: I5d79bc58a4f7a324ae06457130d8372ffe403830 --- packaging/hip-targets-release.cmake | 41 +++++++++++ packaging/hip-targets.cmake | 102 ++++++++++++++++++++++++++++ packaging/hip_hcc.txt | 2 + 3 files changed, 145 insertions(+) create mode 100644 packaging/hip-targets-release.cmake create mode 100644 packaging/hip-targets.cmake diff --git a/packaging/hip-targets-release.cmake b/packaging/hip-targets-release.cmake new file mode 100644 index 0000000000..ba0a5005f5 --- /dev/null +++ b/packaging/hip-targets-release.cmake @@ -0,0 +1,41 @@ +#---------------------------------------------------------------- +# Generated CMake target import file for configuration "Release". +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Import target "hip::hip_hcc_static" for configuration "Release" +set_property(TARGET hip::hip_hcc_static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_hcc_static PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LINK_INTERFACE_LIBRARIES_RELEASE "hc_am" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_hcc_static.a" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_hcc_static ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_hcc_static "/opt/rocm/hip/lib/libhip_hcc_static.a" ) + +# Import target "hip::hip_hcc" for configuration "Release" +set_property(TARGET hip::hip_hcc APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_hcc PROPERTIES + IMPORTED_LINK_INTERFACE_LIBRARIES_RELEASE "hcc::hccrt;hcc::hc_am" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_hcc.so" + IMPORTED_SONAME_RELEASE "libhip_hcc.so" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_hcc ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_hcc "/opt/rocm/hip/lib/libhip_hcc.so" ) + +# Import target "hip::hip_device" for configuration "Release" +set_property(TARGET hip::hip_device APPEND 
PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_device PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_device.a" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_device ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_device "/opt/rocm/hip/lib/libhip_device.a" ) + +# Commands beyond this point should not need to know the version. +set(CMAKE_IMPORT_FILE_VERSION) diff --git a/packaging/hip-targets.cmake b/packaging/hip-targets.cmake new file mode 100644 index 0000000000..65370eec9e --- /dev/null +++ b/packaging/hip-targets.cmake @@ -0,0 +1,102 @@ +# Generated by CMake 3.5.1 + +if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5) + message(FATAL_ERROR "CMake >= 2.6.0 required") +endif() +cmake_policy(PUSH) +cmake_policy(VERSION 2.6) +#---------------------------------------------------------------- +# Generated CMake target import file. +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Protect against multiple inclusion, which would fail when already imported targets are added once more. 
+set(_targetsDefined) +set(_targetsNotDefined) +set(_expectedTargets) +foreach(_expectedTarget hip::hip_hcc_static hip::hip_hcc hip::hip_device) + list(APPEND _expectedTargets ${_expectedTarget}) + if(NOT TARGET ${_expectedTarget}) + list(APPEND _targetsNotDefined ${_expectedTarget}) + endif() + if(TARGET ${_expectedTarget}) + list(APPEND _targetsDefined ${_expectedTarget}) + endif() +endforeach() +if("${_targetsDefined}" STREQUAL "${_expectedTargets}") + set(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() +if(NOT "${_targetsDefined}" STREQUAL "") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n") +endif() +unset(_targetsDefined) +unset(_targetsNotDefined) +unset(_expectedTargets) + + +# The installation prefix configured by this project. +set(_IMPORT_PREFIX "/opt/rocm/hip") + +# Create imported target hip::hip_hcc_static +add_library(hip::hip_hcc_static STATIC IMPORTED) + +set_target_properties(hip::hip_hcc_static PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Create imported target hip::hip_hcc +add_library(hip::hip_hcc SHARED IMPORTED) + +set_target_properties(hip::hip_hcc PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Create imported target hip::hip_device +add_library(hip::hip_device STATIC IMPORTED) + +set_target_properties(hip::hip_device PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Load information for each installed configuration. 
+get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +file(GLOB CONFIG_FILES "${_DIR}/hip-targets-*.cmake") +foreach(f ${CONFIG_FILES}) + include(${f}) +endforeach() + +# Cleanup temporary variables. +set(_IMPORT_PREFIX) + +# Loop over all imported files and verify that they actually exist +foreach(target ${_IMPORT_CHECK_TARGETS} ) + foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} ) + if(NOT EXISTS "${file}" ) + message(FATAL_ERROR "The imported target \"${target}\" references the file + \"${file}\" +but this file does not exist. Possible reasons include: +* The file was deleted, renamed, or moved to another location. +* An install or uninstall procedure did not complete successfully. +* The installation package was faulty and contained + \"${CMAKE_CURRENT_LIST_FILE}\" +but not all the files it references. +") + endif() + endforeach() + unset(_IMPORT_CHECK_FILES_FOR_${target}) +endforeach() +unset(_IMPORT_CHECK_TARGETS) + +# This file does not depend on other imported targets which have +# been exported from the same project but in a separate export set. + +# Commands beyond this point should not need to know the version. 
+set(CMAKE_IMPORT_FILE_VERSION) +cmake_policy(POP) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index 7dd65033fd..7118c32eb9 100644 --- a/packaging/hip_hcc.txt +++ b/packaging/hip_hcc.txt @@ -6,6 +6,8 @@ install(FILES @PROJECT_BINARY_DIR@/libhip_hcc_static.a DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/libhip_device.a DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) install(FILES @hip_SOURCE_DIR@/src/hip_hc.ll @hip_SOURCE_DIR@/src/hip_hc_gfx803.ll DESTINATION lib) +install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip) +install(FILES @hip_SOURCE_DIR@/packaging/hip-targets.cmake @hip_SOURCE_DIR@/packaging/hip-targets-release.cmake DESTINATION lib/cmake/hip) ############################# # Packaging steps From 83097e9da4b79ffef9499ee3cf696b55ca79de5a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 12:12:04 +0530 Subject: [PATCH 028/171] dtests should ignore HIP_PATH env var Change-Id: I27b1cdab6e6b799987dad3ce97b56c764b1b8867 --- CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 94ed2a7562..b4e80625d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -372,14 +372,14 @@ endif() # Testing steps ############################# # Target: test -set(HIP_PATH ${CMAKE_INSTALL_PREFIX}) +set(HIP_ROOT_DIR ${CMAKE_INSTALL_PREFIX}) set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}) -execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_PATH}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) +execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) if(${RUN_HIT} EQUAL 0) - execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_PATH}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) + execute_process(COMMAND "${CMAKE_COMMAND}" 
-E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) endif() if(${RUN_HIT} EQUAL 0) - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH}) include(${HIP_SRC_PATH}/tests/hit/HIT.cmake) # Add tests From bfa08cd49a94a9cf04e43bbe4e1e6d4950acb0a1 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 12:38:38 +0530 Subject: [PATCH 029/171] Do not rebuild cmake cache by default Change-Id: Ie21e99beaa3465b54b5a6a77439c455f34de98b3 --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b4e80625d9..eee1a14a8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -142,7 +142,7 @@ add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) # Build steps ############################# # Rebuild cmake cache updates .hipInfo and .hipVersion -add_custom_target(update_build_and_version_info ALL COMMAND make rebuild_cache) +add_custom_target(update_build_and_version_info COMMAND make rebuild_cache) # Build clang hipify if enabled add_subdirectory(hipify-clang) From 0a9feb4f61ffae996957692484a8dcb4605527e1 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 14:41:11 +0530 Subject: [PATCH 030/171] FindHIP: Handle remove_item from empty lists Change-Id: I6adf31b32edeae9e8454b1a2528064cf3985fca1 --- cmake/FindHIP.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake index 0001436fee..1d71238ce6 100644 --- a/cmake/FindHIP.cmake +++ b/cmake/FindHIP.cmake @@ -514,7 +514,7 @@ macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS 
${_nvcc_options}) - list(REMOVE_ITEM _sources ${_source_files}) + list(REMOVE_ITEM _sources "${_source_files}") if("x${HCC_HOME}" STREQUAL "x") set(HCC_HOME "/opt/rocm/hcc") endif() @@ -530,7 +530,7 @@ macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources ${_source_files}) + list(REMOVE_ITEM _sources "${_source_files}") add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 4f363df159740277d11ec8224bf17937dbb30c03 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 16 Apr 2017 14:22:48 -0500 Subject: [PATCH 031/171] Update bugs - Add CreateKernel, new signature for static kerns. --- docs/markdown/hip_bugs.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index 9452fae2fd..abb31d80e8 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -11,7 +11,13 @@ ### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**` Some common code practices may lead to hipcc generating a error with the form : +``` undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ +``` +Or: +``` +error: weak declaration cannot have internal linkage +``` Suggested workarounds: - Avoid use of static with kernel definition: @@ -26,6 +32,19 @@ namespace { } ``` +### Can't find kernels inside dynamic linked library + +HCC requires use of the "-Bsymbolic" flag when creating a dynamic library which contains kernels. 
This flag causes the symbols to be created with a signature which allows HCC to discover and load the kernels in the dynamic library. This flag is often not set by default and must be added to the link step of the library. If not done, HCC will be unable to find the kernels defined in the library, and will emit a message such as: + +``` +HSADevice::CreateKernel(): Unable to create kernel +``` + +To correct, add the following flag to hcc or hipcc: +``` +$ hipcc -Wl,-Bsymbolic ... +``` + ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); From 486716a400347a1a4488261712c886bd57ad26ce Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 18 Apr 2017 10:21:20 +0530 Subject: [PATCH 032/171] FindHIP: Apply remove_item on non-empty lists only Change-Id: Ib7fcb992d7e1bb679d4d86676fe3d980ba204815 --- cmake/FindHIP.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmake/FindHIP.cmake b/cmake/FindHIP.cmake index 1d71238ce6..5a5813ba0d 100644 --- a/cmake/FindHIP.cmake +++ b/cmake/FindHIP.cmake @@ -514,7 +514,9 @@ macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources "${_source_files}") + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() if("x${HCC_HOME}" STREQUAL "x") set(HCC_HOME "/opt/rocm/hcc") endif() @@ -530,7 +532,9 @@ macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) 
HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources "${_source_files}") + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 7699aeb3d37576ea3fdfb0eb9f2333386f9415a6 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Mon, 17 Apr 2017 23:58:34 -0500 Subject: [PATCH 033/171] Fix RPM HIP packages from specifying /opt Change-Id: Iec3c3b81eef4c8888d425eefc80b12488a8d20a1 --- packaging/hip_base.txt | 1 + packaging/hip_doc.txt | 1 + packaging/hip_hcc.txt | 1 + packaging/hip_nvcc.txt | 1 + packaging/hip_samples.txt | 1 + 5 files changed, 5 insertions(+) diff --git a/packaging/hip_base.txt b/packaging/hip_base.txt index a208bc3463..836a82657b 100644 --- a/packaging/hip_base.txt +++ b/packaging/hip_base.txt @@ -33,5 +33,6 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_doc.txt b/packaging/hip_doc.txt index d5a0c471b1..6f602c84cf 100644 --- a/packaging/hip_doc.txt +++ b/packaging/hip_doc.txt @@ -36,5 +36,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index 7118c32eb9..b0808aa0bc 100644 --- 
a/packaging/hip_hcc.txt +++ b/packaging/hip_hcc.txt @@ -46,5 +46,6 @@ if(@COMPILE_HIP_ATP_MARKER@) else() set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") endif() +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_nvcc.txt b/packaging/hip_nvcc.txt index ea4943f282..0d7c357623 100644 --- a/packaging/hip_nvcc.txt +++ b/packaging/hip_nvcc.txt @@ -25,5 +25,6 @@ set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") #set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, cuda >= 7.5") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_samples.txt b/packaging/hip_samples.txt index f289f2a8e5..6d34a6fd40 100644 --- a/packaging/hip_samples.txt +++ b/packaging/hip_samples.txt @@ -24,5 +24,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) From 1b6d08ada0b9b4a2c044aca93ede1c70cf8bbbd3 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Mon, 17 Apr 2017 23:58:34 -0500 Subject: [PATCH 034/171] Fix RPM HIP packages from specifying /opt Change-Id: Iec3c3b81eef4c8888d425eefc80b12488a8d20a1 --- packaging/hip_base.txt | 1 + packaging/hip_doc.txt | 1 + packaging/hip_hcc.txt | 1 + packaging/hip_nvcc.txt | 1 + packaging/hip_samples.txt | 1 + 5 files changed, 5 insertions(+) diff --git a/packaging/hip_base.txt b/packaging/hip_base.txt index a208bc3463..836a82657b 100644 --- a/packaging/hip_base.txt +++ b/packaging/hip_base.txt @@ -33,5 +33,6 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE 
"${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_doc.txt b/packaging/hip_doc.txt index d5a0c471b1..6f602c84cf 100644 --- a/packaging/hip_doc.txt +++ b/packaging/hip_doc.txt @@ -36,5 +36,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index 7118c32eb9..b0808aa0bc 100644 --- a/packaging/hip_hcc.txt +++ b/packaging/hip_hcc.txt @@ -46,5 +46,6 @@ if(@COMPILE_HIP_ATP_MARKER@) else() set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") endif() +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_nvcc.txt b/packaging/hip_nvcc.txt index ea4943f282..0d7c357623 100644 --- a/packaging/hip_nvcc.txt +++ b/packaging/hip_nvcc.txt @@ -25,5 +25,6 @@ set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") #set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, cuda >= 7.5") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/packaging/hip_samples.txt b/packaging/hip_samples.txt index f289f2a8e5..6d34a6fd40 100644 --- a/packaging/hip_samples.txt +++ b/packaging/hip_samples.txt @@ -24,5 +24,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") 
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) From fa15ee3ccd0026c96eb7019439c3016b085b3a64 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 19 Apr 2017 10:47:40 -0500 Subject: [PATCH 035/171] fix broken header in NV path Change-Id: Ia3aff2a89d9ba49547f51ce03a3304dfab58ba25 --- include/hip/nvcc_detail/hip_runtime_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 7e881df3ab..0cc40f32af 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -948,4 +948,6 @@ inline static hipChannelFormatDesc hipCreateChannelDesc() { return cudaCreateChannelDesc(); } -#endif +#endif //__CUDACC__ + +#endif //HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H From f4287a29b21a1651ebeb88857c553bf50d4308c9 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 19 Apr 2017 10:59:55 -0500 Subject: [PATCH 036/171] add support of hipLaunchKernelGGL on NV path Change-Id: I0aeafd80c2181873be385d985f1d8ed86a98d136 --- include/hip/nvcc_detail/hip_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h index b4fa13f48c..80da388007 100644 --- a/include/hip/nvcc_detail/hip_runtime.h +++ b/include/hip/nvcc_detail/hip_runtime.h @@ -36,6 +36,10 @@ do {\ kernelName<<>>(0, ##__VA_ARGS__);\ } while(0) +#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) 
\ +do {\ +kernelName<<<numblocks, numthreads, memperblock, streamId>>>(__VA_ARGS__);\ +} while(0) #define hipReadModeElementType cudaReadModeElementType From 6d5ac3fbac0ff7bd7ae20bab05b050b226013d84 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 21 Apr 2017 09:01:34 -0500 Subject: [PATCH 037/171] Fix compilation error with nvcc (c++ nullptr) --- samples/0_Intro/square/Makefile | 1 + samples/0_Intro/square/square.hipref.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/samples/0_Intro/square/Makefile b/samples/0_Intro/square/Makefile index 1e8cdba080..aa48cc5864 100644 --- a/samples/0_Intro/square/Makefile +++ b/samples/0_Intro/square/Makefile @@ -15,5 +15,6 @@ square.hip.out: square.hipref.cpp + clean: rm -f *.o *.out diff --git a/samples/0_Intro/square/square.hipref.cpp b/samples/0_Intro/square/square.hipref.cpp index 963ab63260..e694bfb8a4 100644 --- a/samples/0_Intro/square/square.hipref.cpp +++ b/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From 3c1dd246d67cbc61280733ac6b74cf2401ca3af9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 21 Apr 2017 21:46:30 +0300 Subject: [PATCH 038/171] [HIPIFY] Initial sync HIPIFY with HIP by CUDA Driver API data types. + Introduce CUDA_Driver_API_functions_supported_by_HIP.md. + Initial update of HIPIFY with CUDA driver data types. + Initial sync HIP types against CUDA Driver and Runtime API types. + Typo fixes. 
--- ...A_Driver_API_functions_supported_by_HIP.md | 499 ++++++++ hipify-clang/src/Cuda2Hip.cpp | 1044 +++++++++++------ 2 files changed, 1157 insertions(+), 386 deletions(-) create mode 100644 docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md new file mode 100644 index 0000000000..3434d29a70 --- /dev/null +++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -0,0 +1,499 @@ +# CUDA Driver API functions supported by HIP + +## **1. Data types used by CUDA driver** + +| **type** | **CUDA** | **HIP** | **CUDA description** | +|-------------:|---------------------------------------------------------------|------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| struct | `CUDA_ARRAY3D_DESCRIPTOR` | | | +| struct | `CUDA_ARRAY_DESCRIPTOR` | | | +| struct | `CUDA_MEMCPY2D` | | | +| struct | `CUDA_MEMCPY3D` | | | +| struct | `CUDA_MEMCPY3D_PEER` | | | +| struct | `CUDA_POINTER_ATTRIBUTE_P2P_TOKENS` | | | +| struct | `CUDA_RESOURCE_DESC` | | | +| struct | `CUDA_RESOURCE_VIEW_DESC` | | | +| struct | `CUdevprop` | `hipDeviceProp_t` | | +| struct | `CUipcEventHandle` | | | +| struct | `CUipcMemHandle` | | | +| enum |***`CUaddress_mode`*** | | Texture reference addressing modes | +| 0 |*`CU_TR_ADDRESS_MODE_WRAP`* | | Wrapping address mode | +| 1 |*`CU_TR_ADDRESS_MODE_CLAMP`* | | Clamp to edge address mode | +| 2 |*`CU_TR_ADDRESS_MODE_MIRROR`* | | Mirror address mode | +| 3 |*`CU_TR_ADDRESS_MODE_BORDER`* | | Border address mode | +| enum |***`CUarray_cubemap_face`*** | | Array indices for cube faces | +| 0x00 |*`CU_CUBEMAP_FACE_POSITIVE_X`* | | Positive X face of cubemap | +| 0x01 |*`CU_CUBEMAP_FACE_NEGATIVE_X`* | | Negative X face of cubemap | +| 0x02 |*`CU_CUBEMAP_FACE_POSITIVE_Y`* | | 
Positive Y face of cubemap | +| 0x03 |*`CU_CUBEMAP_FACE_NEGATIVE_Y`* | | Negative Y face of cubemap | +| 0x04 |*`CU_CUBEMAP_FACE_POSITIVE_Z`* | | Positive Z face of cubemap | +| 0x05 |*`CU_CUBEMAP_FACE_NEGATIVE_Z`* | | Negative Z face of cubemap | +| enum |***`CUarray_format`*** | | Array formats | +| 0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`* | | Unsigned 8-bit integers | +| 0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`* | | Unsigned 16-bit integers | +| 0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`* | | Unsigned 32-bit integers | +| 0x08 |*`CU_AD_FORMAT_SIGNED_INT8`* | | Signed 8-bit integers | +| 0x09 |*`CU_AD_FORMAT_SIGNED_INT16`* | | Signed 16-bit integers | +| 0x0a |*`CU_AD_FORMAT_SIGNED_INT32`* | | Signed 32-bit integers | +| 0x10 |*`CU_AD_FORMAT_HALF`* | | 16-bit floating point | +| 0x20 |*`CU_AD_FORMAT_FLOAT`* | | 32-bit floating point | +| enum |***`CUctx_flags`*** | | Context creation flags | +| 0x00 |*`CU_CTX_SCHED_AUTO`* | | Automatic scheduling | +| 0x01 |*`CU_CTX_SCHED_SPIN`* | | Set spin as default scheduling | +| 0x02 |*`CU_CTX_SCHED_YIELD`* | | Set yield as default scheduling | +| 0x04 |*`CU_CTX_SCHED_BLOCKING_SYNC`* | | Set blocking synchronization as default scheduling | +| 0x04 |*`CU_CTX_BLOCKING_SYNC`* | | Set blocking synchronization as default scheduling Deprecated. 
This flag was deprecated as of CUDA 4.0 and was replaced with CU_CTX_SCHED_BLOCKING_SYNC.| +| 0x07 |*`CU_CTX_SCHED_MASK`* | | | +| 0x08 |*`CU_CTX_MAP_HOST`* | | Support mapped pinned allocations | +| 0x10 |*`CU_CTX_LMEM_RESIZE_TO_MAX`* | | Keep local memory allocation after launch | +| 0x1f |*`CU_CTX_FLAGS_MASK`* | | | +| enum |***`CUdevice_attribute`*** | | Device properties | +| 1 |*`CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK`* |*`hipDeviceAttributeMaxThreadsPerBlock`* | Maximum number of threads per block | +| 2 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X`* |*`hipDeviceAttributeMaxBlockDimX`* | Maximum block dimension X | +| 3 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y`* |*`hipDeviceAttributeMaxBlockDimY`* | Maximum block dimension Y | +| 4 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z`* |*`hipDeviceAttributeMaxBlockDimZ`* | Maximum block dimension Z | +| 5 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X`* |*`hipDeviceAttributeMaxGridDimX`* | Maximum grid dimension X | +| 6 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y`* |*`hipDeviceAttributeMaxGridDimY`* | Maximum grid dimension Y | +| 7 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z`* |*`hipDeviceAttributeMaxGridDimZ`* | Maximum grid dimension Z | +| 8 |*`CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`* |*`hipDeviceAttributeMaxSharedMemoryPerBlock`* | Maximum shared memory available per block in bytes | +| 8 |*`CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK`* |*`hipDeviceAttributeMaxSharedMemoryPerBlock`* | Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK | +| 9 |*`CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY`* |*`hipDeviceAttributeTotalConstantMemory`* | Memory available on device for __constant__ variables in a CUDA C kernel in bytes | +| 10 |*`CU_DEVICE_ATTRIBUTE_WARP_SIZE`* |*`hipDeviceAttributeWarpSize`* | Warp size in threads | +| 11 |*`CU_DEVICE_ATTRIBUTE_MAX_PITCH`* | | Maximum pitch in bytes allowed by memory copies | +| 12 |*`CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK`* |*`hipDeviceAttributeMaxRegistersPerBlock`* | Maximum number of 
32-bit registers available per block | +| 12 |*`CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK`* |*`hipDeviceAttributeMaxRegistersPerBlock`* | Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK | +| 13 |*`CU_DEVICE_ATTRIBUTE_CLOCK_RATE`* |*`hipDeviceAttributeClockRate`* | Typical clock frequency in kilohertz | +| 14 |*`CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`* | | Alignment requirement for textures | +| 15 |*`CU_DEVICE_ATTRIBUTE_GPU_OVERLAP`* | | Device can possibly copy memory and execute a kernel concurrently. Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT| +| 16 |*`CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`* |*`hipDeviceAttributeMultiprocessorCount`* | Number of multiprocessors on device | +| 17 |*`CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT`* | | Specifies whether there is a run time limit on kernels | +| 18 |*`CU_DEVICE_ATTRIBUTE_INTEGRATED`* | | Device is integrated with host memory | +| 19 |*`CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY`* | | Device can map host memory into CUDA address space | +| 20 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_MODE`* |*`hipDeviceAttributeComputeMode`* | Compute mode (See CUcomputemode for details) | +| 21 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH`* | | Maximum 1D texture width | +| 22 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH`* | | Maximum 2D texture width | +| 23 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT`* | | Maximum 2D texture height | +| 24 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH`* | | Maximum 3D texture width | +| 25 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT`* | | Maximum 3D texture height | +| 26 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH`* | | Maximum 3D texture depth | +| 27 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH`* | | Maximum 2D layered texture width | +| 28 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT`* | | Maximum 2D layered texture height | +| 29 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS`* | | Maximum layers in a 2D layered texture | +| 27 
|*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH | +| 28 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT | +| 29 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS | +| 30 |*`CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT`* | | Alignment requirement for surfaces | +| 31 |*`CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS`* |*`hipDeviceAttributeConcurrentKernels`* | Device can possibly execute multiple kernels concurrently | +| 32 |*`CU_DEVICE_ATTRIBUTE_ECC_ENABLED`* | | Device has ECC support enabled | +| 33 |*`CU_DEVICE_ATTRIBUTE_PCI_BUS_ID`* |*`hipDeviceAttributePciBusId`* | PCI bus ID of the device | +| 34 |*`CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID`* |*`hipDeviceAttributePciDeviceId`* | PCI device ID of the device | +| 35 |*`CU_DEVICE_ATTRIBUTE_TCC_DRIVER`* | | Device is using TCC driver model | +| 36 |*`CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE`* |*`hipDeviceAttributeMemoryClockRate`* | Peak memory clock frequency in kilohertz | +| 37 |*`CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH`* |*`hipDeviceAttributeMemoryBusWidth`* | Global memory bus width in bits | +| 38 |*`CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE`* |*`hipDeviceAttributeL2CacheSize`* | Size of L2 cache in bytes | +| 39 |*`CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR`* |*`hipDeviceAttributeMaxThreadsPerMultiProcessor`* | Maximum resident threads per multiprocessor | +| 40 |*`CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT`* | | Number of asynchronous engines | +| 41 |*`CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`* | | Device shares a unified address space with the host | +| 42 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH`* | | Maximum 1D layered texture width | +| 43 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS`* | | Maximum layers in a 1D layered texture | +| 44 
|*`CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER`* | | Deprecated, do not use | +| 45 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH`* | | Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set | +| 46 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT`* | | Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set | +| 47 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE`* | | Alternate maximum 3D texture width | +| 48 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE`* | | Alternate maximum 3D texture height | +| 49 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE`* | | Alternate maximum 3D texture depth | +| 50 |*`CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID`* | | PCI domain ID of the device | +| 51 |*`CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`* | | Pitch alignment requirement for textures | +| 52 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH`* | | Maximum cubemap texture width/height | +| 53 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH`* | | Maximum cubemap layered texture width/height | +| 54 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS`* | | Maximum layers in a cubemap layered texture | +| 55 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH`* | | Maximum 1D surface width | +| 56 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH`* | | Maximum 2D surface width | +| 57 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT`* | | Maximum 2D surface height | +| 58 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH`* | | Maximum 3D surface width | +| 59 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT`* | | Maximum 3D surface height | +| 60 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH`* | | Maximum 3D surface depth | +| 61 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH`* | | Maximum 1D layered surface width | +| 62 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS`* | | Maximum layers in a 1D layered surface | +| 63 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH`* | | Maximum 2D layered 
surface width | +| 64 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT`* | | Maximum 2D layered surface height | +| 65 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS`* | | Maximum layers in a 2D layered surface | +| 66 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH`* | | Maximum cubemap surface width | +| 67 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH`* | | Maximum cubemap layered surface width | +| 68 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS`* | | Maximum layers in a cubemap layered surface | +| 69 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`* | | Maximum 1D linear texture width | +| 70 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH`* | | Maximum 2D linear texture width | +| 71 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`* | | Maximum 2D linear texture height | +| 72 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`* | | Maximum 2D linear texture pitch in bytes | +| 73 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH`* | | Maximum mipmapped 2D texture width | +| 74 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT`* | | Maximum mipmapped 2D texture height | +| 75 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR`* |*`hipDeviceAttributeComputeCapabilityMajor`* | Major compute capability version number | +| 76 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR`* |*`hipDeviceAttributeComputeCapabilityMinor`* | Minor compute capability version number | +| 77 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH`* | | Maximum mipmapped 1D texture width | +| 78 |*`CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED`* | | Device supports stream priorities | +| 79 |*`CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED`* | | Device supports caching globals in L1 | +| 80 |*`CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED`* | | Device supports caching locals in L1 | +| 81 |*`CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR`* 
|*`hipDeviceAttributeMaxSharedMemoryPerMultiprocessor`* | Maximum shared memory available per multiprocessor in bytes | +| 82 |*`CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR`* | | Maximum number of 32-bit registers available per multiprocessor | +| 83 |*`CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`* |*`hipDeviceAttributeManagedMemory`* | Device can allocate managed memory on this system | +| 84 |*`CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD`* | | Device is on a multi-GPU board | +| 85 |*`CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID`* | | Unique id for a group of devices on the same multi-GPU board | +| 86 |*`CU_DEVICE_ATTRIBUTE_MAX`* | | | +| enum |***`CUevent_flags`*** | | Event creation flags | +| 0x00 |*`CU_EVENT_DEFAULT`* |*`hipEventDefault`* | Default event flag | +| 0x01 |*`CU_EVENT_BLOCKING_SYNC`* |*`hipEventBlockingSync`* | Event uses blocking synchronization | +| 0x02 |*`CU_EVENT_DISABLE_TIMING`* |*`hipEventDisableTiming`* | Event will not record timing data | +| 0x04 |*`CU_EVENT_INTERPROCESS`* |*`hipEventInterprocess`* | Event is suitable for interprocess use. 
CU_EVENT_DISABLE_TIMING must be set | +| enum |***`CUfilter_mode`*** |***`hipTextureFilterMode`*** | Texture reference filtering modes | +| 0 |*`CU_TR_FILTER_MODE_POINT`* |*`hipFilterModePoint`* | Point filter mode | +| 1 |*`CU_TR_FILTER_MODE_LINEAR`* |*`hipFilterModeLinear`* | Linear filter mode | +| enum |***`CUfunc_cache`*** |***`hipFuncCache`*** | Function cache configurations | +| 0x00 |*`CU_FUNC_CACHE_PREFER_NONE`* |*`hipFuncCachePreferNone`* | no preference for shared memory or L1 (default) | +| 0x01 |*`CU_FUNC_CACHE_PREFER_SHARED`* |*`hipFuncCachePreferShared`* | prefer larger shared memory and smaller L1 cache | +| 0x02 |*`CU_FUNC_CACHE_PREFER_L1`* |*`hipFuncCachePreferL1`* | prefer larger L1 cache and smaller shared memory | +| 0x03 |*`CU_FUNC_CACHE_PREFER_EQUAL`* |*`hipFuncCachePreferEqual`* | prefer equal sized L1 cache and shared memory | +| enum |***`CUfunction_attribute`*** | | Function properties | +| 0 |*`CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`* | | The maximum number of threads per block, beyond which a launch of the function would fail. This number depends on both the function and the device on which the function is currently loaded. | +| 1 |*`CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`* | | The size in bytes of statically-allocated shared memory required by this function. This does not include dynamically-allocated shared memory requested by the user at runtime. | +| 2 |*`CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES`* | | The size in bytes of user-allocated constant memory required by this function. | +| 3 |*`CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES`* | | The size in bytes of local memory used by each thread of this function. | +| 4 |*`CU_FUNC_ATTRIBUTE_NUM_REGS`* | | The number of registers used by each thread of this function. | +| 5 |*`CU_FUNC_ATTRIBUTE_PTX_VERSION`* | | The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. 
Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0. | +| 6 |*`CU_FUNC_ATTRIBUTE_BINARY_VERSION`* | | The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version. | +| 7 |*`CU_FUNC_ATTRIBUTE_CACHE_MODE_CA`* | | The attribute to indicate whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. | +| 8 |*`CU_FUNC_ATTRIBUTE_MAX`* | | | +| enum |***`CUgraphicsMapResourceFlags`*** | | Flags for mapping and unmapping interop resources | +| 0x00 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`* | | | +| 0x01 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY`* | | | +| 0x02 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD`* | | | +| enum |***`CUgraphicsRegisterFlags`*** | | Flags to register a graphics resource | +| 0x00 |*`CU_GRAPHICS_REGISTER_FLAGS_NONE`* | | | +| 0x01 |*`CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY`* | | | +| 0x02 |*`CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD`* | | | +| 0x04 |*`CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST`* | | | +| 0x08 |*`CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER`* | | | +| enum |***`CUipcMem_flags`*** | | CUDA Ipc Mem Flags | +| 0x1 |*`CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS`* |*`hipIpcMemLazyEnablePeerAccess`* | Automatically enable peer access between remote devices as needed | +| enum |***`CUjit_cacheMode`*** | | Caching modes for dlcm | +| 0 |*`CU_JIT_CACHE_OPTION_NONE`* | | Compile with no -dlcm flag specified | +| |*`CU_JIT_CACHE_OPTION_CG`* | | Compile with L1 cache disabled | +| |*`CU_JIT_CACHE_OPTION_CA`* | | Compile with L1 cache enabled | +| enum |***`CUjit_fallback`*** | | Cubin matching fallback strategies | +| 0 |*`CU_PREFER_PTX`* | | Prefer to compile ptx if exact binary match not found | +| |*`CU_PREFER_BINARY`* | | Prefer to 
fall back to compatible binary code if exact match not found | +| enum |***`CUjit_option`*** | | Online compiler and linker options | +| 0 |*`CU_JIT_MAX_REGISTERS`* | | Max number of registers that a thread may use. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_THREADS_PER_BLOCK`* | | IN: Specifies minimum number of threads per block to target compilation for OUT: Returns the number of threads the compiler actually targeted. This restricts the resource utilization of the compiler (e.g. max registers) such that a block with the given number of threads should be able to launch based on register limitations. Note, this option does not currently take into account any other resource limitations, such as shared memory utilization. Cannot be combined with CU_JIT_TARGET. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_WALL_TIME`* | | Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker Option type: float Applies to: compiler and linker. | +| |*`CU_JIT_INFO_LOG_BUFFER`* | | Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) Option type: char * Applies to: compiler and linker. | +| |*`CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`* | | IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) OUT: Amount of log buffer filled with messages Option type: unsigned int Applies to: compiler and linker. | +| |*`CU_JIT_OPTIMIZATION_LEVEL`* | | Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_TARGET_FROM_CUCONTEXT`* | | No option value required. Determines the target based on the current attached context (default) Option type: No option value needed Applies to: compiler and linker. 
| +| |*`CU_JIT_TARGET`* | | Target is chosen based on supplied CUjit_target. Cannot be combined with CU_JIT_THREADS_PER_BLOCK. Option type: unsigned int for enumerated type CUjit_target Applies to: compiler and linker. | +| |*`CU_JIT_FALLBACK_STRATEGY`* | | Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied CUjit_fallback. This option cannot be used with cuLink* APIs as the linker requires exact matches. Option type: unsigned int for enumerated type CUjit_fallback Applies to: compiler only. | +| |*`CU_JIT_GENERATE_DEBUG_INFO`* | | Specifies whether to create debug information in output (-g) (0: false, default) Option type: int Applies to: compiler and linker. | +| |*`CU_JIT_LOG_VERBOSE`* | | Generate verbose log messages (0: false, default) Option type: int Applies to: compiler and linker. | +| |*`CU_JIT_GENERATE_LINE_INFO`* | | Generate line number information (-lineinfo) (0: false, default) Option type: int Applies to: compiler only. | +| |*`CU_JIT_CACHE_MODE`* | | Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied CUjit_cacheMode_enum. Option type: unsigned int for enumerated type CUjit_cacheMode_enum Applies to: compiler only. | +| |*`CU_JIT_NUM_OPTIONS`* | | | +| enum |***`CUjit_target`*** | | Online compilation targets | +| 10 |*`CU_TARGET_COMPUTE_10`* | | Compute device class 1.0. | +| 11 |*`CU_TARGET_COMPUTE_11`* | | Compute device class 1.1. | +| 12 |*`CU_TARGET_COMPUTE_12`* | | Compute device class 1.2. | +| 13 |*`CU_TARGET_COMPUTE_13`* | | Compute device class 1.3. | +| 20 |*`CU_TARGET_COMPUTE_20`* | | Compute device class 2.0. | +| 21 |*`CU_TARGET_COMPUTE_21`* | | Compute device class 2.1. | +| 30 |*`CU_TARGET_COMPUTE_30`* | | Compute device class 3.0. | +| 32 |*`CU_TARGET_COMPUTE_32`* | | Compute device class 3.2. | +| 35 |*`CU_TARGET_COMPUTE_35`* | | Compute device class 3.5. | +| 37 |*`CU_TARGET_COMPUTE_37`* | | Compute device class 3.7. 
| +| 50 |*`CU_TARGET_COMPUTE_50`* | | Compute device class 5.0. | +| 52 |*`CU_TARGET_COMPUTE_52`* | | Compute device class 5.2. | +| enum |***`CUjitInputType`*** | | Device code formats | +| 0 |*`CU_JIT_INPUT_CUBIN`* | | Compiled device-class-specific device code Applicable options: none. | +| |*`CU_JIT_INPUT_PTX`* | | PTX source code Applicable options: PTX compiler options. | +| |*`CU_JIT_INPUT_FATBINARY`* | | Bundle of multiple cubins and/or PTX of some device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. | +| |*`CU_JIT_INPUT_OBJECT`* | | Host object with embedded device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. | +| |*`CU_JIT_INPUT_LIBRARY`* | | Archive of host objects with embedded device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. | +| |*`CU_JIT_NUM_INPUT_TYPES`* | | | +| enum |***`CUlimit`*** |***`hipLimit_t`*** | Limits | +| 0x00 |*`CU_LIMIT_STACK_SIZE`* | | GPU thread stack size. | +| 0x01 |*`CU_LIMIT_PRINTF_FIFO_SIZE`* | | GPU printf FIFO size. | +| 0x02 |*`CU_LIMIT_MALLOC_HEAP_SIZE`* |*`hipLimitMallocHeapSize`* | GPU malloc heap size. | +| 0x03 |*`CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH`* | | GPU device runtime launch synchronize depth. | +| 0x04 |*`CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT`* | | GPU device runtime pending launch count. | +| |*`CU_LIMIT_MAX`* | | | +| enum |***`CUmemAttach_flags`*** | | CUDA Mem Attach Flags | +| 0x1 |*`CU_MEM_ATTACH_GLOBAL`* | | Memory can be accessed by any stream on any device. | +| 0x2 |*`CU_MEM_ATTACH_HOST`* | | Memory cannot be accessed by any stream on any device. | +| 0x4 |*`CU_MEM_ATTACH_SINGLE`* | | Memory can only be accessed by a single stream on the associated device. 
| +| enum |***`CUmemorytype`*** | | Memory types | +| 0x01 |*`CU_MEMORYTYPE_HOST`* | | Host memory | +| 0x02 |*`CU_MEMORYTYPE_DEVICE`* | | Device memory | +| 0x03 |*`CU_MEMORYTYPE_ARRAY`* | | Array memory | +| 0x04 |*`CU_MEMORYTYPE_UNIFIED`* | | Unified device or host memory | +| enum |***`CUoccupancy_flags`*** | | Occupancy calculator flag | +| 0x00 |*`CU_OCCUPANCY_DEFAULT`* | | Default behavior | +| 0x01 |*`CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE`* | | Assume global caching is enabled and cannot be automatically turned off | +| enum |***`CUpointer_attribute`*** | | Pointer information | +| 1 |*`CU_POINTER_ATTRIBUTE_CONTEXT`* | | The CUcontext on which a pointer was allocated or registered | +| 2 |*`CU_POINTER_ATTRIBUTE_MEMORY_TYPE`* | | The CUmemorytype describing the physical location of a pointer | +| 3 |*`CU_POINTER_ATTRIBUTE_DEVICE_POINTER`* | | The address at which a pointer's memory may be accessed on the device | +| 4 |*`CU_POINTER_ATTRIBUTE_HOST_POINTER`* | | The address at which a pointer's memory may be accessed on the host | +| 5 |*`CU_POINTER_ATTRIBUTE_P2P_TOKENS`* | | A pair of tokens for use with the nv-p2p.h Linux kernel interface | +| 6 |*`CU_POINTER_ATTRIBUTE_SYNC_MEMOPS`* | | Synchronize every synchronous memory operation initiated on this region | +| 7 |*`CU_POINTER_ATTRIBUTE_BUFFER_ID`* | | A process-wide unique ID for an allocated memory region | +| 8 |*`CU_POINTER_ATTRIBUTE_IS_MANAGED`* | | Indicates if the pointer points to managed memory | +| enum |***`CUresourcetype`*** | | Resource types | +| 0x00 |*`CU_RESOURCE_TYPE_ARRAY`* | | Array resource | +| 0x01 |*`CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`* | | Mipmapped array resource | +| 0x02 |*`CU_RESOURCE_TYPE_LINEAR`* | | Linear resource | +| 0x03 |*`CU_RESOURCE_TYPE_PITCH2D`* | | Pitch 2D resource | +| enum |***`CUresourceViewFormat`*** | | Resource view format | +| 0x00 |*`CU_RES_VIEW_FORMAT_NONE`* | | No resource view format (use underlying resource format) | +| 0x01 |*`CU_RES_VIEW_FORMAT_UINT_1X8`* 
| | 1 channel unsigned 8-bit integers | +| 0x02 |*`CU_RES_VIEW_FORMAT_UINT_2X8`* | | 2 channel unsigned 8-bit integers | +| 0x03 |*`CU_RES_VIEW_FORMAT_UINT_4X8`* | | 4 channel unsigned 8-bit integers | +| 0x04 |*`CU_RES_VIEW_FORMAT_SINT_1X8`* | | 1 channel signed 8-bit integers | +| 0x05 |*`CU_RES_VIEW_FORMAT_SINT_2X8`* | | 2 channel signed 8-bit integers | +| 0x06 |*`CU_RES_VIEW_FORMAT_SINT_4X8`* | | 4 channel signed 8-bit integers | +| 0x07 |*`CU_RES_VIEW_FORMAT_UINT_1X16`* | | 1 channel unsigned 16-bit integers | +| 0x08 |*`CU_RES_VIEW_FORMAT_UINT_2X16`* | | 2 channel unsigned 16-bit integers | +| 0x09 |*`CU_RES_VIEW_FORMAT_UINT_4X16`* | | 4 channel unsigned 16-bit integers | +| 0x0a |*`CU_RES_VIEW_FORMAT_SINT_1X16`* | | 1 channel signed 16-bit integers | +| 0x0b |*`CU_RES_VIEW_FORMAT_SINT_2X16`* | | 2 channel signed 16-bit integers | +| 0x0c |*`CU_RES_VIEW_FORMAT_SINT_4X16`* | | 4 channel signed 16-bit integers | +| 0x0d |*`CU_RES_VIEW_FORMAT_UINT_1X32`* | | 1 channel unsigned 32-bit integers | +| 0x0e |*`CU_RES_VIEW_FORMAT_UINT_2X32`* | | 2 channel unsigned 32-bit integers | +| 0x0f |*`CU_RES_VIEW_FORMAT_UINT_4X32`* | | 4 channel unsigned 32-bit integers | +| 0x10 |*`CU_RES_VIEW_FORMAT_SINT_1X32`* | | 1 channel signed 32-bit integers | +| 0x11 |*`CU_RES_VIEW_FORMAT_SINT_2X32`* | | 2 channel signed 32-bit integers | +| 0x12 |*`CU_RES_VIEW_FORMAT_SINT_4X32`* | | 4 channel signed 32-bit integers | +| 0x13 |*`CU_RES_VIEW_FORMAT_FLOAT_1X16`* | | 1 channel 16-bit floating point | +| 0x14 |*`CU_RES_VIEW_FORMAT_FLOAT_2X16`* | | 2 channel 16-bit floating point | +| 0x15 |*`CU_RES_VIEW_FORMAT_FLOAT_4X16`* | | 4 channel 16-bit floating point | +| 0x16 |*`CU_RES_VIEW_FORMAT_FLOAT_1X32`* | | 1 channel 32-bit floating point | +| 0x17 |*`CU_RES_VIEW_FORMAT_FLOAT_2X32`* | | 2 channel 32-bit floating point | +| 0x18 |*`CU_RES_VIEW_FORMAT_FLOAT_4X32`* | | 4 channel 32-bit floating point | +| 0x19 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC1`* | | Block compressed 1 | +| 0x1a 
|*`CU_RES_VIEW_FORMAT_UNSIGNED_BC2`* | | Block compressed 2 | +| 0x1b |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC3`* | | Block compressed 3 | +| 0x1c |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC4`* | | Block compressed 4 unsigned | +| 0x1d |*`CU_RES_VIEW_FORMAT_SIGNED_BC4`* | | Block compressed 4 signed | +| 0x1e |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC5`* | | Block compressed 5 unsigned | +| 0x1f |*`CU_RES_VIEW_FORMAT_SIGNED_BC5`* | | Block compressed 5 signed | +| 0x20 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC6H`* | | Block compressed 6 unsigned half-float | +| 0x21 |*`CU_RES_VIEW_FORMAT_SIGNED_BC6H`* | | Block compressed 6 signed half-float | +| 0x22 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC7`* | | Block compressed 7 | +| enum |***`CUresult`*** |***`hipError_t`*** | Error codes | +| 0 |*`CUDA_SUCCESS`* |*`hipSuccess`* | The API call returned with no errors. In the case of query calls, this can also mean that the operation being queried is complete (see cuEventQuery() and cuStreamQuery()). | +| 1 |*`CUDA_ERROR_INVALID_VALUE`* |*`hipErrorInvalidValue`* | This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. | +| 2 |*`CUDA_ERROR_OUT_OF_MEMORY`* |*`hipErrorMemoryAllocation`* | The API call failed because it was unable to allocate enough memory to perform the requested operation. | +| 3 |*`CUDA_ERROR_NOT_INITIALIZED`* |*`hipErrorNotInitialized`* | This indicates that the CUDA driver has not been initialized with cuInit() or that initialization has failed. | +| 4 |*`CUDA_ERROR_DEINITIALIZED`* |*`hipErrorDeinitialized`* | This indicates that the CUDA driver is in the process of shutting down. | +| 5 |*`CUDA_ERROR_PROFILER_DISABLED`* |*`hipErrorProfilerDisabled`* | This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler. 
| +| 6 |*`CUDA_ERROR_PROFILER_NOT_INITIALIZED`* |*`hipErrorProfilerNotInitialized`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to attempt to enable/disable the profiling via cuProfilerStart or cuProfilerStop without initialization. | +| 7 |*`CUDA_ERROR_PROFILER_ALREADY_STARTED`* |*`hipErrorProfilerAlreadyStarted`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStart() when profiling is already enabled. | +| 8 |*`CUDA_ERROR_PROFILER_ALREADY_STOPPED`* |*`hipErrorProfilerAlreadyStopped`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStop() when profiling is already disabled. | +| 100 |*`CUDA_ERROR_NO_DEVICE`* |*`hipErrorNoDevice`* | This indicates that no CUDA-capable devices were detected by the installed CUDA driver. | +| 101 |*`CUDA_ERROR_INVALID_DEVICE`* |*`hipErrorInvalidDevice`* | This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device. | +| 200 |*`CUDA_ERROR_INVALID_IMAGE`* |*`hipErrorInvalidImage`* | This indicates that the device kernel image is invalid. This can also indicate an invalid CUDA module. | +| 201 |*`CUDA_ERROR_INVALID_CONTEXT`* |*`hipErrorInvalidContext`* | This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had cuCtxDestroy() invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See cuCtxGetApiVersion() for more details. | +| 202 |*`CUDA_ERROR_CONTEXT_ALREADY_CURRENT`* |*`hipErrorContextAlreadyCurrent`* | This indicated that the context being supplied as a parameter to the API call was already the active context. Deprecated This error return is deprecated as of CUDA 3.2. 
It is no longer an error to attempt to push the active context via cuCtxPushCurrent(). | +| 205 |*`CUDA_ERROR_MAP_FAILED`* |*`hipErrorMapFailed`* | This indicates that a map or register operation has failed. | +| 206 |*`CUDA_ERROR_UNMAP_FAILED`* |*`hipErrorUnmapFailed`* | This indicates that an unmap or unregister operation has failed. | +| 207 |*`CUDA_ERROR_ARRAY_IS_MAPPED`* |*`hipErrorArrayIsMapped`* | This indicates that the specified array is currently mapped and thus cannot be destroyed. | +| 208 |*`CUDA_ERROR_ALREADY_MAPPED`* |*`hipErrorAlreadyMapped`* | This indicates that the resource is already mapped. | +| 209 |*`CUDA_ERROR_NO_BINARY_FOR_GPU`* |*`hipErrorNoBinaryForGpu`* | This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration. | +| 210 |*`CUDA_ERROR_ALREADY_ACQUIRED`* |*`hipErrorAlreadyAcquired`* | This indicates that a resource has already been acquired. | +| 211 |*`CUDA_ERROR_NOT_MAPPED`* |*`hipErrorNotMapped`* | This indicates that a resource is not mapped. | +| 212 |*`CUDA_ERROR_NOT_MAPPED_AS_ARRAY`* |*`hipErrorNotMappedAsArray`* | This indicates that a mapped resource is not available for access as an array. | +| 213 |*`CUDA_ERROR_NOT_MAPPED_AS_POINTER`* |*`hipErrorNotMappedAsPointer`* | This indicates that a mapped resource is not available for access as a pointer. | +| 214 |*`CUDA_ERROR_ECC_UNCORRECTABLE`* |*`hipErrorECCNotCorrectable`* | This indicates that an uncorrectable ECC error was detected during execution. | +| 215 |*`CUDA_ERROR_UNSUPPORTED_LIMIT`* |*`hipErrorUnsupportedLimit`* | This indicates that the CUlimit passed to the API call is not supported by the active device. 
| +| 216 |*`CUDA_ERROR_CONTEXT_ALREADY_IN_USE`* |*`hipErrorContextAlreadyInUse`* | This indicates that the CUcontext passed to the API call can only be bound to a single CPU thread at a time but is already bound to a CPU thread. | +| 217 |*`CUDA_ERROR_PEER_ACCESS_UNSUPPORTED`* |*`hipErrorPeerAccessUnsupported`* | This indicates that peer access is not supported across the given devices. | +| 218 |*`CUDA_ERROR_INVALID_PTX`* |*`hipErrorInvalidKernelFile`* | This indicates that a PTX JIT compilation failed. | +| 219 |*`CUDA_ERROR_INVALID_GRAPHICS_CONTEXT`* |*`hipErrorInvalidGraphicsContext`* | This indicates an error with OpenGL or DirectX context. | +| 300 |*`CUDA_ERROR_INVALID_SOURCE`* |*`hipErrorInvalidSource`* | This indicates that the device kernel source is invalid. | +| 301 |*`CUDA_ERROR_FILE_NOT_FOUND`* |*`hipErrorFileNotFound`* | This indicates that the file specified was not found. | +| 302 |*`CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`* |*`hipErrorSharedObjectSymbolNotFound`* | This indicates that a link to a shared object failed to resolve. | +| 303 |*`CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`* |*`hipErrorSharedObjectInitFailed`* | This indicates that initialization of a shared object failed. | +| 304 |*`CUDA_ERROR_OPERATING_SYSTEM`* |*`hipErrorOperatingSystem`* | This indicates that an OS call failed. | +| 400 |*`CUDA_ERROR_INVALID_HANDLE`* |*`hipErrorInvalidResourceHandle`* | This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like CUstream and CUevent. | +| 500 |*`CUDA_ERROR_NOT_FOUND`* |*`hipErrorNotFound`* | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. | +| 600 |*`CUDA_ERROR_NOT_READY`* |*`hipErrorNotReady`* | This indicates that asynchronous operations issued previously have not completed yet. 
This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). | +| 700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`* |*`hipErrorIllegalAddress`* | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | + + +## **2. Error Handling** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **3. Initialization** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **4. Version Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **5. Device Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + +## **6. 
Device Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **7. Primary Context Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **8. Context Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **9. Context Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **10. Module Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **11. Memory Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **12. 
Unified Addressing** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **13. Stream Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **14. Event Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **15. Execution Control** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **16. Execution Control [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **17. Occupancy** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **18. 
Texture Reference Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **19. Texture Reference Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **20. Surface Reference Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **21. Texture Object Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **22. Surface Object Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **23. Peer Context Memory Access** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **24. 
Graphics Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **25. Profiler Control** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **26. OpenGL Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **27. Direct3D 9 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **28. Direct3D 10 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **29. Direct3D 11 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **30. 
VDPAU Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 383af0440c..5a2940322e 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -81,6 +81,7 @@ enum ConvTypes { CONV_GL, CONV_GRAPHICS, CONV_SURFACE, + CONV_JIT, CONV_OTHER, CONV_INCLUDE, CONV_INCLUDE_CUDA_MAIN_H, @@ -94,7 +95,7 @@ const char *counterNames[CONV_LAST] = { "driver", "dev", "mem", "kern", "coord_func", "math_func", "special_func", "stream", "event", "occupancy", "ctx", "module", "cache", "exec", "err", "def", "tex", "gl", - "graphics", "surface", "other", "include", "include_cuda_main_header", + "graphics", "surface", "jit", "other", "include", "include_cuda_main_header", "type", "literal", "numeric_literal"}; enum ApiTypes { @@ -190,24 +191,23 @@ struct cuda2hipMap { // Error codes and return types cuda2hipRename["CUresult"] = {"hipError_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaError"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; - // CUDA Driver API error code only - cuda2hipRename["CUDA_ERROR_INVALID_CONTEXT"] = {"hipErrorInvalidContext", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_CURRENT"] = {"hipErrorContextAlreadyCurrent", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_MAP_FAILED"] = {"hipErrorMapFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_UNMAP_FAILED"] = {"hipErrorUnmapFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ARRAY_IS_MAPPED"] = {"hipErrorArrayIsMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ALREADY_MAPPED"] = 
{"hipErrorAlreadyMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ALREADY_ACQUIRED"] = {"hipErrorAlreadyAcquired", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED"] = {"hipErrorNotMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; + // CUDA Driver API error codes only + cuda2hipRename["CUDA_ERROR_INVALID_CONTEXT"] = {"hipErrorInvalidContext", CONV_ERR, API_DRIVER}; // 201 + cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_CURRENT"] = {"hipErrorContextAlreadyCurrent", CONV_ERR, API_DRIVER}; // 202 + cuda2hipRename["CUDA_ERROR_ARRAY_IS_MAPPED"] = {"hipErrorArrayIsMapped", CONV_ERR, API_DRIVER}; // 207 + cuda2hipRename["CUDA_ERROR_ALREADY_MAPPED"] = {"hipErrorAlreadyMapped", CONV_ERR, API_DRIVER}; // 208 + cuda2hipRename["CUDA_ERROR_ALREADY_ACQUIRED"] = {"hipErrorAlreadyAcquired", CONV_ERR, API_DRIVER}; // 210 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED"] = {"hipErrorNotMapped", CONV_ERR, API_DRIVER}; // 211 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 + cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 + cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 + 
cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 + cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 // CUDA RT API error code only cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 @@ -216,8 +216,6 @@ struct cuda2hipMap { cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 - cuda2hipRename["cudaErrorMapBufferObjectFailed"] = {"hipErrorMapBufferObjectFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 14 - cuda2hipRename["cudaErrorUnmapBufferObjectFailed"] = {"hipErrorUnmapBufferObjectFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 15 cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 @@ -262,17 +260,96 @@ struct cuda2hipMap { // Deprecated as of CUDA 4.1 cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 - cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 - cuda2hipRename["CUDA_ERROR_OUT_OF_MEMORY"] = {"hipErrorMemoryAllocation", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorMemoryAllocation"] = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME}; // 2 - cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = 
{"hipErrorNotInitialized", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; // 3 + cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; // 0 + cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 - cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CUDA_ERROR_INVALID_VALUE"] = {"hipErrorInvalidValue", CONV_ERR, API_DRIVER}; // 1 + cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME}; // 11 + + cuda2hipRename["CUDA_ERROR_OUT_OF_MEMORY"] = {"hipErrorMemoryAllocation", CONV_ERR, API_DRIVER}; // 2 + cuda2hipRename["cudaErrorMemoryAllocation"] = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME}; // 2 + + cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorNotInitialized", CONV_ERR, API_DRIVER}; // 3 + cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; // 3 + + cuda2hipRename["CUDA_ERROR_DEINITIALIZED"] = {"hipErrorDeinitialized", CONV_ERR, API_DRIVER}; // 4 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorCudartUnloading"] = {"hipErrorDeinitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 29 + + cuda2hipRename["CUDA_ERROR_PROFILER_DISABLED"] = {"hipErrorProfilerDisabled", CONV_ERR, API_DRIVER}; // 5 + cuda2hipRename["cudaErrorProfilerDisabled"] = {"hipErrorProfilerDisabled", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 55 + + cuda2hipRename["CUDA_ERROR_PROFILER_NOT_INITIALIZED"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_DRIVER}; // 6 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerNotInitialized"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 56 + + 
cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STARTED"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_DRIVER}; // 7 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerAlreadyStarted"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 57 + + cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STOPPED"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_DRIVER}; // 8 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerAlreadyStopped"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 58 + + cuda2hipRename["CUDA_ERROR_NO_DEVICE"] = {"hipErrorNoDevice", CONV_ERR, API_DRIVER}; // 100 + cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME}; // 38 + + cuda2hipRename["CUDA_ERROR_INVALID_DEVICE"] = {"hipErrorInvalidDevice", CONV_ERR, API_DRIVER}; // 101 + cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME}; // 10 + + cuda2hipRename["CUDA_ERROR_INVALID_IMAGE"] = {"hipErrorInvalidImage", CONV_ERR, API_DRIVER}; // 200 + cuda2hipRename["cudaErrorInvalidKernelImage"] = {"hipErrorInvalidImage", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 47 + + cuda2hipRename["CUDA_ERROR_MAP_FAILED"] = {"hipErrorMapFailed", CONV_ERR, API_DRIVER}; // 205 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorMapBufferObjectFailed"] = {"hipErrorMapFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 14 + + cuda2hipRename["CUDA_ERROR_UNMAP_FAILED"] = {"hipErrorUnmapFailed", CONV_ERR, API_DRIVER}; // 206 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorUnmapBufferObjectFailed"] = {"hipErrorUnmapFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 15 + + cuda2hipRename["CUDA_ERROR_NO_BINARY_FOR_GPU"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_DRIVER}; // 209 + cuda2hipRename["cudaErrorNoKernelImageForDevice"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 48 + + 
cuda2hipRename["CUDA_ERROR_ECC_UNCORRECTABLE"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_DRIVER}; // 214 + cuda2hipRename["cudaErrorECCUncorrectable"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 39 + + cuda2hipRename["CUDA_ERROR_UNSUPPORTED_LIMIT"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_DRIVER}; // 215 + cuda2hipRename["cudaErrorUnsupportedLimit"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 42 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_DRIVER}; // 217 + cuda2hipRename["cudaErrorPeerAccessUnsupported"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 64 + + cuda2hipRename["CUDA_ERROR_INVALID_PTX"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_DRIVER}; // 218 + cuda2hipRename["cudaErrorInvalidPtx"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 78 + + cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; // 219 + cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 + + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; // 302 + cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 + + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_DRIVER}; // 303 + cuda2hipRename["cudaErrorSharedObjectInitFailed"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 41 + + cuda2hipRename["CUDA_ERROR_OPERATING_SYSTEM"] = {"hipErrorOperatingSystem", CONV_ERR, API_DRIVER}; // 304 + cuda2hipRename["cudaErrorOperatingSystem"] = {"hipErrorOperatingSystem", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 63 + + 
cuda2hipRename["CUDA_ERROR_INVALID_HANDLE"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_DRIVER}; // 400 + cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME}; // 33 + + cuda2hipRename["CUDA_ERROR_NOT_READY"] = {"hipErrorNotReady", CONV_ERR, API_DRIVER}; // 600 + cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR, API_RUNTIME}; // 34 + + cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; // 700 + cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; // 719 + cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 @@ -280,94 +357,81 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - cuda2hipRename["CUDA_ERROR_INVALID_DEVICE"] = {"hipErrorInvalidDevice", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME}; // 10 - - cuda2hipRename["CUDA_ERROR_INVALID_VALUE"] = {"hipErrorInvalidValue", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME}; // 11 - - cuda2hipRename["CUDA_ERROR_DEINITIALIZED"] = {"hipErrorDeinitialized", CONV_ERR, 
API_DRIVER}; - // TODO: double check, that this error matches to hipErrorDeinitialized - cuda2hipRename["cudaErrorCudartUnloading"] = {"hipErrorDeinitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 29 - cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 - cuda2hipRename["CUDA_ERROR_INVALID_HANDLE"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME}; // 33 - // cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorInitializationError", CONV_ERR, API_DRIVER}; // cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; - cuda2hipRename["CUDA_ERROR_NOT_READY"] = {"hipErrorNotReady", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR, API_RUNTIME}; // 34 - - cuda2hipRename["CUDA_ERROR_NO_DEVICE"] = {"hipErrorNoDevice", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME}; // 38 - - cuda2hipRename["CUDA_ERROR_ECC_UNCORRECTABLE"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorECCUncorrectable"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 39 - - cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 - - cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorSharedObjectInitFailed"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 41 - - cuda2hipRename["CUDA_ERROR_UNSUPPORTED_LIMIT"] = 
{"hipErrorUnsupportedLimit", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorUnsupportedLimit"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 42 - - cuda2hipRename["CUDA_ERROR_INVALID_IMAGE"] = {"hipErrorInvalidImage", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidKernelImage"] = {"hipErrorInvalidImage", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 47 - - cuda2hipRename["CUDA_ERROR_NO_BINARY_FOR_GPU"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNoKernelImageForDevice"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 48 - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 - cuda2hipRename["CUDA_ERROR_PROFILER_DISABLED"] = {"hipErrorProfilerDisabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorProfilerDisabled"] = {"hipErrorProfilerDisabled", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 55 - - cuda2hipRename["CUDA_ERROR_PROFILER_NOT_INITIALIZED"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerNotInitialized"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 56 - - cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STARTED"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerAlreadyStarted"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 57 - - cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STOPPED"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, 
API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerAlreadyStopped"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 58 - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 - cuda2hipRename["CUDA_ERROR_OPERATING_SYSTEM"] = {"hipErrorOperatingSystem", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorOperatingSystem"] = {"hipErrorOperatingSystem", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 63 - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessUnsupported"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 64 - - cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 - - cuda2hipRename["CUDA_ERROR_INVALID_PTX"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidPtx"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 78 - - cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 - - - ///////////////////////////// CUDA DRIVER API ///////////////////////////// + // enums + cuda2hipRename["CUDA_ARRAY3D_DESCRIPTOR"] = 
{"HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_ARRAY_DESCRIPTOR"] = {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY2D"] = {"HIP_MEMCPY2D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY3D"] = {"HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY3D_PEER"] = {"HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_POINTER_ATTRIBUTE_P2P_TOKENS"] = {"HIP_POINTER_ATTRIBUTE_P2P_TOKENS", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_RESOURCE_DESC"] = {"HIP_RESOURCE_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_RESOURCE_VIEW_DESC"] = {"HIP_RESOURCE_VIEW_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + cuda2hipRename["CUipcEventHandle"] = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUipcMemHandle"] = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + + + cuda2hipRename["CUaddress_mode"] = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + + cuda2hipRename["CUarray_cubemap_face"] = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + 
cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 + + cuda2hipRename["CUarray_format"] = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a + cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 + // Compute mode + cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) + cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Context flags + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; @@ -375,104 +439,108 @@ struct cuda2hipMap { // Types // NOTE: CUdevice might be changed to typedef int in the future. 
- cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; - - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; + cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) // unsupported yet by HIP - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = 
{"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) // Deprecated. 
Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = 
{"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE 
(cudaDevAttrConcurrentKernels = 31) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) // deprecated, do not use - // cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (no) + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; @@ -481,52 +549,232 @@ struct cuda2hipMap { cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t. // TODO: Do for Pointer Attributes the same as for Device Attributes. 
- // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER}; - // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER}; + // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"] = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"] = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"] = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_HOST_POINTER"] = {"hipPointerAttributeHostPointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_P2P_TOKENS"] = {"hipPointerAttributeP2pTokens", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_SYNC_MEMOPS"] = {"hipPointerAttributeSyncMemops", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"] = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"] = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (no) + // pointer to CUfunc_st - cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; // TODO: in HIP ihipModuleSymbol_t should be 
declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h // ToDO: examples are needed with CUfunc_st - // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; + // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; // unsupported yet by HIP - cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipFuncAttributeMaxThreadsPerBlocks", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES"] = {"hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES"] = {"hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES"] = {"hipFuncAttributeLocalSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_NUM_REGS"] = {"hipFuncAttributeNumRegs", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_PTX_VERSION"] = {"hipFuncAttributePtxVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_BINARY_VERSION"] = {"hipFuncAttributeBinaryVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_CACHE_MODE_CA"] = {"hipFuncAttributeCacheModeCA", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_MAX"] = {"hipFuncAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUfunc_cache_enum"] = 
{"hipFuncCache", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_SHARED"] = {"hipFuncCachePreferShared", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_L1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_EQUAL"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER}; + // enum CUgraphicsMapResourceFlags/CUgraphicsMapResourceFlags_enum + cuda2hipRename["CUgraphicsMapResourceFlags"] = {"hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsMapFlags) + cuda2hipRename["CUgraphicsMapResourceFlags_enum"] = {"hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsMapFlags) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE"] = {"hipGraphicsMapFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsNone = 0) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY"] = {"hipGraphicsMapFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsReadOnly = 1) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsWriteDiscard = 2) - cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = 
{"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; + // enum CUgraphicsRegisterFlags/CUgraphicsRegisterFlags_enum + cuda2hipRename["CUgraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsRegisterFlags) + cuda2hipRename["CUgraphicsRegisterFlags_enum"] = {"hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsRegisterFlags) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_NONE"] = {"hipGraphicsRegisterFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsNone = 0) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsReadOnly = 1) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsWriteDiscard = 2) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsSurfaceLoadStore = 4) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsTextureGather = 8) - cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; + // enum CUoccupancy_flags/CUoccupancy_flags_enum + cuda2hipRename["CUoccupancy_flags"] = {"hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUoccupancy_flags_enum"] = {"hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_OCCUPANCY_DEFAULT"] =
{"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0) + cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1) + + + + cuda2hipRename["CUfunc_cache_enum"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) + cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) + cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; // 0x00 // API_Runtime ANALOGUE (cudaFuncCachePreferNone = 0) + cuda2hipRename["CU_FUNC_CACHE_PREFER_SHARED"] = {"hipFuncCachePreferShared", CONV_CACHE, API_DRIVER}; // 0x01 // API_Runtime ANALOGUE (cudaFuncCachePreferShared = 1) + cuda2hipRename["CU_FUNC_CACHE_PREFER_L1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_DRIVER}; // 0x02 // API_Runtime ANALOGUE (cudaFuncCachePreferL1 = 2) + cuda2hipRename["CU_FUNC_CACHE_PREFER_EQUAL"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER}; // 0x03 // API_Runtime ANALOGUE (cudaFuncCachePreferEqual = 3) + + // enum CUipcMem_flags/CUipcMem_flags_enum + cuda2hipRename["CUipcMem_flags"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUipcMem_flags_enum"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS"] = {"hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 // API_Runtime ANALOGUE (cudaIpcMemLazyEnablePeerAccess = 0x01) + + // enum CUipcMem_flags/CUipcMem_flags_enum + cuda2hipRename["CUipcMem_flags"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + + // JIT + // enum CUjit_cacheMode/CUjit_cacheMode_enum +
cuda2hipRename["CUjit_cacheMode"] = {"hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_cacheMode_enum"] = {"hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_NONE"] = {"hipJitCacheModeOptionNone", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_CG"] = {"hipJitCacheModeOptionCG", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_CA"] = {"hipJitCacheModeOptionCA", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_fallback/CUjit_fallback_enum + cuda2hipRename["CUjit_fallback"] = {"hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_fallback_enum"] = {"hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_PREFER_PTX"] = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_PREFER_BINARY"] = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_option/CUjit_option_enum + cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, 
HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_target/CUjit_target_enum + cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_target_enum"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_10"] = {"hipJitTargetCompute10", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_11"] = {"hipJitTargetCompute11", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_12"] = {"hipJitTargetCompute12", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_13"] = {"hipJitTargetCompute13", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_20"] = {"hipJitTargetCompute20", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; +
cuda2hipRename["CU_TARGET_COMPUTE_21"] = {"hipJitTargetCompute21", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_30"] = {"hipJitTargetCompute30", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_32"] = {"hipJitTargetCompute32", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_35"] = {"hipJitTargetCompute35", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_37"] = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_50"] = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_52"] = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjitInputType/CUjitInputType_enum + cuda2hipRename["CUjitInputType"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjitInputType_enum"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_CUBIN"] = {"hipJitInputTypeBin", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_PTX"] = {"hipJitInputTypePtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_FATBINARY"] = {"hipJitInputTypeFatBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_OBJECT"] = {"hipJitInputTypeObject", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_LIBRARY"] = {"hipJitInputTypeLibrary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_INPUT_TYPES"] = {"hipJitInputTypeNumInputTypes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + + // Limits + cuda2hipRename["CUlimit"] = {"hipLimit_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaLimit) + cuda2hipRename["CUlimit_enum"] = {"hipLimit_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaLimit) + cuda2hipRename["CU_LIMIT_STACK_SIZE"] = {"hipLimitStackSize", CONV_TYPE, 
API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaLimitStackSize = 0x00) + cuda2hipRename["CU_LIMIT_PRINTF_FIFO_SIZE"] = {"hipLimitPrintfFifoSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaLimitPrintfFifoSize = 0x01) + cuda2hipRename["CU_LIMIT_MALLOC_HEAP_SIZE"] = {"hipLimitMallocHeapSize", CONV_TYPE, API_DRIVER}; // 0x02 // API_Runtime ANALOGUE (cudaLimitMallocHeapSize = 0x02) + cuda2hipRename["CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH"] = {"hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaLimitDevRuntimeSyncDepth = 0x03) + cuda2hipRename["CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT"] = {"hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaLimitDevRuntimePendingLaunchCount = 0x04) + cuda2hipRename["CU_LIMIT_MAX"] = {"hipLimitMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + + // enum CUmemAttach_flags/CUmemAttach_flags_enum + cuda2hipRename["CUmemAttach_flags"] = {"hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUmemAttach_flags_enum"] = {"hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEM_ATTACH_GLOBAL"] = {"hipMemAttachGlobal", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 // API_Runtime ANALOGUE (#define cudaMemAttachGlobal 0x01) + cuda2hipRename["CU_MEM_ATTACH_HOST"] = {"hipMemAttachHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x2 // API_Runtime ANALOGUE (#define cudaMemAttachHost 0x02) + cuda2hipRename["CU_MEM_ATTACH_SINGLE"] = {"hipMemAttachSingle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x4 // API_Runtime ANALOGUE (#define cudaMemAttachSingle 0x04) + + // enum CUmemorytype/CUmemorytype_enum + cuda2hipRename["CUmemorytype"] = {"hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no -
cudaMemoryType is not an analogue) + cuda2hipRename["CUmemorytype_enum"] = {"hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no - cudaMemoryType is not an analogue) + cuda2hipRename["CU_MEMORYTYPE_HOST"] = {"hipMemTypeHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_DEVICE"] = {"hipMemTypeDevice", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_ARRAY"] = {"hipMemTypeArray", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_UNIFIED"] = {"hipMemTypeUnified", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (no) + + // enum CUresourcetype + cuda2hipRename["CUresourcetype"] = {"hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceType) + cuda2hipRename["CUresourcetype_enum"] = {"hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceType) + cuda2hipRename["CU_RESOURCE_TYPE_ARRAY"] = {"hipResourceTypeArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaResourceTypeArray = 0x00) + cuda2hipRename["CU_RESOURCE_TYPE_MIPMAPPED_ARRAY"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaResourceTypeMipmappedArray = 0x01) + cuda2hipRename["CU_RESOURCE_TYPE_LINEAR"] = {"hipResourceTypeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaResourceTypeLinear = 0x02) + cuda2hipRename["CU_RESOURCE_TYPE_PITCH2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaResourceTypePitch2D = 0x03) + + // enum CUresourceViewFormat/CUresourceViewFormat_enum + cuda2hipRename["CUresourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE 
(cudaResourceViewFormat) + cuda2hipRename["CUresourceViewFormat_enum"] = {"hipResourceViewFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceViewFormat) + cuda2hipRename["CU_RES_VIEW_FORMAT_NONE"] = {"hipResViewFormatNone", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaResViewFormatNone = 0x00) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X8"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar1 = 0x01) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X8"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar2 = 0x02) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X8"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar4 = 0x03) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X8"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar1 = 0x04) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X8"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar2 = 0x05) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X8"] = {"hipResViewFormatSignedChar4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x06 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar4 = 0x06) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X16"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort1 = 0x07) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X16"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort2 = 0x08) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X16"] = 
{"hipResViewFormatUnsignedShort4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort4 = 0x09) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X16"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a // API_Runtime ANALOGUE (cudaResViewFormatSignedShort1 = 0x0a) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X16"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0b // API_Runtime ANALOGUE (cudaResViewFormatSignedShort2 = 0x0b) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X16"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0c // API_Runtime ANALOGUE (cudaResViewFormatSignedShort4 = 0x0c) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X32"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0d // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt1 = 0x0d) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X32"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0e // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt2 = 0x0e) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X32"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0f // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt4 = 0x0f) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X32"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt1 = 0x10) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X32"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x11 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt2 = 0x11) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X32"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x12 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt4 = 0x12) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_1X16"] = {"hipResViewFormatHalf1", CONV_TEX, API_DRIVER, 
HIP_UNSUPPORTED}; // 0x13 // API_Runtime ANALOGUE (cudaResViewFormatHalf1 = 0x13) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_2X16"] = {"hipResViewFormatHalf2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x14 // API_Runtime ANALOGUE (cudaResViewFormatHalf2 = 0x14) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_4X16"] = {"hipResViewFormatHalf4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x15 // API_Runtime ANALOGUE (cudaResViewFormatHalf4 = 0x15) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_1X32"] = {"hipResViewFormatFloat1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x16 // API_Runtime ANALOGUE (cudaResViewFormatFloat1 = 0x16) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_2X32"] = {"hipResViewFormatFloat2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x17 // API_Runtime ANALOGUE (cudaResViewFormatFloat2 = 0x17) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_4X32"] = {"hipResViewFormatFloat4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x18 // API_Runtime ANALOGUE (cudaResViewFormatFloat4 = 0x18) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x19 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed1 = 0x19) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1a // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed2 = 0x1a) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1b // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed3 = 0x1b) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1c // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed4 = 0x1c) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, 
API_DRIVER, HIP_UNSUPPORTED}; // 0x1d // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed4 = 0x1d) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1e // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed5 = 0x1e) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed5 = 0x1f) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC6H"] = {"hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed6H = 0x20) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22) + + + + cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; + + cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUctx_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUmodule"] = {"hipModule_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUmodule"] = 
{"hipModule_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUmod_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUstream_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; // Stream Flags - cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; - cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; + cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; + cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; // Init - cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; + cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; // Driver cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; @@ -568,14 +816,17 @@ struct cuda2hipMap { // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; - // ToDO: - // cuda2hipRename["CUevent_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // ToDo: + // cuda2hipRename["CUevent_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; // Event Flags + cuda2hipRename["CUevent_flags"] = {"hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + // ToDo: + // cuda2hipRename["CUevent_flags_enum"] = {"hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_EVENT_DEFAULT"] = {"hipEventDefault", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_BLOCKING_SYNC"] = {"hipEventBlockingSync", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_DISABLE_TIMING"] = {"hipEventDisableTiming", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_INTERPROCESS"] = {"hipEventInterprocess", CONV_EVENT, API_DRIVER}; - + // Event functions cuda2hipRename["cuEventCreate"] = {"hipEventCreate", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventDestroy_v2"] = 
{"hipEventDestroy", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventElapsedTime"] = {"hipEventElapsedTime", CONV_EVENT, API_DRIVER}; @@ -627,7 +878,7 @@ struct cuda2hipMap { cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; @@ -639,6 +890,14 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; + // Texture Reference Mngmnt + // Texture reference filtering modes + cuda2hipRename["CUfilter_mode"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) + // ToDo: + // cuda2hipRename["CUfilter_mode_enum"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) + cuda2hipRename["CU_TR_FILTER_MODE_POINT"] = {"hipFilterModePoint", CONV_TEX, API_DRIVER}; // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) + cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"] = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaFilterModeLinear = 1) + // Profiler // unsupported yet by HIP cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED}; @@ -676,6 +935,14 @@ struct cuda2hipMap { cuda2hipRename["MINOR_VERSION"] = {"hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME,
HIP_UNSUPPORTED}; cuda2hipRename["PATCH_LEVEL"] = {"hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; + // defines + cuda2hipRename["cudaMemAttachGlobal"] = {"hipMemAttachGlobal", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_MEM_ATTACH_GLOBAL = 0x1) + cuda2hipRename["cudaMemAttachHost"] = {"hipMemAttachHost", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_MEM_ATTACH_HOST = 0x2) + cuda2hipRename["cudaMemAttachSingle"] = {"hipMemAttachSingle", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_MEM_ATTACH_SINGLE = 0x4) + + cuda2hipRename["cudaOccupancyDefault"] = {"hipOccupancyDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_OCCUPANCY_DEFAULT = 0x0) + cuda2hipRename["cudaOccupancyDisableCachingOverride"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1) + // Error API cuda2hipRename["cudaGetLastError"] = {"hipGetLastError", CONV_ERR, API_RUNTIME}; cuda2hipRename["cudaPeekAtLastError"] = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME}; @@ -766,7 +1033,7 @@ struct cuda2hipMap { cuda2hipRename["cudaHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_RUNTIME}; // Memory types - cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM, API_RUNTIME}; // API_Driver ANALOGUE (no - CUmemorytype is not an analogue) cuda2hipRename["cudaMemoryTypeHost"] = {"hipMemoryTypeHost", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM, API_RUNTIME}; @@ -838,7 +1105,6 @@ struct cuda2hipMap { cuda2hipRename["cudaEventBlockingSync"] = {"hipEventBlockingSync", CONV_EVENT, API_RUNTIME}; cuda2hipRename["cudaEventDisableTiming"] = {"hipEventDisableTiming", CONV_EVENT, API_RUNTIME}; cuda2hipRename["cudaEventInterprocess"] = 
{"hipEventInterprocess", CONV_EVENT, API_RUNTIME}; - // Streams cuda2hipRename["cudaStream_t"] = {"hipStream_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_RUNTIME}; @@ -874,93 +1140,94 @@ struct cuda2hipMap { // Attributes cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME}; - 
cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaDevAttrMaxPitch"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTextureAlignment"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) + cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; // 1 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1) + cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; // 2 // API_DRIVER 
ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2) + cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; // 3 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3) + cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME}; // 4 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4) + cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME}; // 5 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5) + cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME}; // 6 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6) + cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME}; // 7 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7) + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME}; // 8 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8) + cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME}; // 9 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY =9) + cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME}; // 10 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10) + cuda2hipRename["cudaDevAttrMaxPitch"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 11 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11) + cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME}; // 12 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12) + cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME}; // 13 // API_DRIVER ANALOGUE
(CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13) + cuda2hipRename["cudaDevAttrTextureAlignment"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 14 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14) // Is not deprecated as CUDA Driver's API analogue CU_DEVICE_ATTRIBUTE_GPU_OVERLAP - cuda2hipRename["cudaDevAttrGpuOverlap"] = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrKernelExecTimeout"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrIntegrated"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrCanMapHostMemory"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DWidth"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DWidth"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DHeight"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DWidth"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DHeight"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DDepth"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"] = 
{"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrSurfaceAlignment"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrEccEnabled"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTccDriver"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrAsyncEngineCount"] = {"hipDevAttrAsyncEngineCount", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrUnifiedAddressing"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrPciDomainId"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTexturePitchAlignment"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DWidth"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DWidth"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DHeight"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DWidth"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DHeight"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DDepth"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrLocalL1CacheSupported"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"] = 
{"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrManagedMemory"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDevAttrGpuOverlap"] = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 15 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15) + cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME}; // 16 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16) + cuda2hipRename["cudaDevAttrKernelExecTimeout"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 17 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17) + cuda2hipRename["cudaDevAttrIntegrated"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 18 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_INTEGRATED = 18) + cuda2hipRename["cudaDevAttrCanMapHostMemory"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 19 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19) + cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME}; // 20 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20) + cuda2hipRename["cudaDevAttrMaxTexture1DWidth"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 21 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21) + cuda2hipRename["cudaDevAttrMaxTexture2DWidth"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 22 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22) + cuda2hipRename["cudaDevAttrMaxTexture2DHeight"] = 
{"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 23 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23) + cuda2hipRename["cudaDevAttrMaxTexture3DWidth"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 24 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24) + cuda2hipRename["cudaDevAttrMaxTexture3DHeight"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 25 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25) + cuda2hipRename["cudaDevAttrMaxTexture3DDepth"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 26 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 27 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 28 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 29 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29) + cuda2hipRename["cudaDevAttrSurfaceAlignment"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 30 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30) + cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME}; // 31 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31) + cuda2hipRename["cudaDevAttrEccEnabled"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, 
API_RUNTIME, HIP_UNSUPPORTED}; // 32 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32) + cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME}; // 33 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33) + cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME}; // 34 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34) + cuda2hipRename["cudaDevAttrTccDriver"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 35 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35) + cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME}; // 36 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36) + cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME}; // 37 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37) + cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME}; // 38 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38) + cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME}; // 39 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39) + cuda2hipRename["cudaDevAttrAsyncEngineCount"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 40 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40) + cuda2hipRename["cudaDevAttrUnifiedAddressing"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 41 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41) + cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 42 // API_DRIVER ANALOGUE 
(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42) + cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 43 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43) + // 44 - no + cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 45 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45) + cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 46 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46) + cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 47 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47) + cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 48 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48) + cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 49 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49) + cuda2hipRename["cudaDevAttrPciDomainId"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 50 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50) + cuda2hipRename["cudaDevAttrTexturePitchAlignment"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 51 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51) + cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, 
API_RUNTIME, HIP_UNSUPPORTED}; // 52 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52) + cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 53 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53) + cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 54 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54) + cuda2hipRename["cudaDevAttrMaxSurface1DWidth"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 55 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55) + cuda2hipRename["cudaDevAttrMaxSurface2DWidth"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 56 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56) + cuda2hipRename["cudaDevAttrMaxSurface2DHeight"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 57 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57) + cuda2hipRename["cudaDevAttrMaxSurface3DWidth"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 58 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58) + cuda2hipRename["cudaDevAttrMaxSurface3DHeight"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 59 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59) + cuda2hipRename["cudaDevAttrMaxSurface3DDepth"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 60 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60) + cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", 
CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 61 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61) + cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 62 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 63 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 64 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 65 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 66 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 67 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 68 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68) + cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 69 // API_DRIVER ANALOGUE 
(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 70 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 71 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 72 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72) + cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 73 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73) + cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 74 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74) + cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME}; // 75 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75) + cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME}; // 76 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76) + cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 77 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77) + cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"] = {"hipDeviceAttributeStreamPrioritiesSupported", 
CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 78 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78) + cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 79 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79) + cuda2hipRename["cudaDevAttrLocalL1CacheSupported"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 80 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80) + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME}; // 81 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81) + cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 82 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82) + cuda2hipRename["cudaDevAttrManagedMemory"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 83 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83) + cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME}; // 84 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84) + cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85) + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; 
@@ -970,10 +1237,11 @@ struct cuda2hipMap { cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // Pointer Attributes - cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME}; - cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME}; + // struct cudaPointerAttributes + cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME}; - cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME}; // Device cuda2hipRename["cudaDeviceProp"] = {"hipDeviceProp_t", CONV_TYPE, API_RUNTIME}; @@ -985,11 +1253,11 @@ struct cuda2hipMap { cuda2hipRename["cudaSetValidDevices"] = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // Compute mode - cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) + cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["cudaComputeModeExclusive"] = 
{"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // Device Flags // unsupported yet by HIP @@ -1020,11 +1288,11 @@ struct cuda2hipMap { // Execution control // CUDA function cache configurations - cuda2hipRename["cudaFuncCache"] = {"hipFuncCache_t", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME}; + cuda2hipRename["cudaFuncCache"] = {"hipFuncCache_t", CONV_CACHE, API_RUNTIME}; // API_Driver ANALOGUE (CUfunc_cache) + cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME}; // 0 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_NONE = 0x00) + cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME}; // 1 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_SHARED = 0x01) + cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME}; // 2 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_L1 = 0x02) + cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME}; // 3 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_EQUAL = 0x03) // Execution control functions // unsupported yet by HIP @@ -1062,7 +1330,9 @@ struct cuda2hipMap { cuda2hipRename["cudaDeviceEnablePeerAccess"] = {"hipDeviceEnablePeerAccess", CONV_DEV, API_RUNTIME}; cuda2hipRename["cudaMemcpyPeerAsync"] = {"hipMemcpyPeerAsync", 
CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyPeer"] = {"hipMemcpyPeer", CONV_MEM, API_RUNTIME}; - cuda2hipRename["cudaIpcMemLazyEnablePeerAccess"] = {"hipIpcMemLazyEnablePeerAccess", CONV_ERR, API_RUNTIME}; + + // #define cudaIpcMemLazyEnablePeerAccess 0x01 + cuda2hipRename["cudaIpcMemLazyEnablePeerAccess"] = {"hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_RUNTIME}; // 0x01 // API_Driver ANALOGUE (CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1) // Shared memory cuda2hipRename["cudaDeviceSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME}; @@ -1078,14 +1348,12 @@ struct cuda2hipMap { cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_RUNTIME}; // Limits - cuda2hipRename["cudaLimit"] = {"hipLimit_t", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaLimitStackSize"] = {"hipLimitStackSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitPrintfFifoSize"] = {"hipLimitPrintfFifoSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitMallocHeapSize"] = {"hipLimitMallocHeapSize", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaLimitDevRuntimeSyncDepth"] = {"hipLimitPrintfFifoSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitDevRuntimePendingLaunchCount"] = {"hipLimitMallocHeapSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaLimit"] = {"hipLimit_t", CONV_TYPE, API_RUNTIME}; // API_Driver ANALOGUE (CUlimit) + cuda2hipRename["cudaLimitStackSize"] = {"hipLimitStackSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_LIMIT_STACK_SIZE = 0x00) + cuda2hipRename["cudaLimitPrintfFifoSize"] = {"hipLimitPrintfFifoSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_LIMIT_PRINTF_FIFO_SIZE = 0x01) + cuda2hipRename["cudaLimitMallocHeapSize"] = {"hipLimitMallocHeapSize", CONV_TYPE, API_RUNTIME}; // 0x02 // API_Driver 
ANALOGUE (CU_LIMIT_MALLOC_HEAP_SIZE = 0x02) + cuda2hipRename["cudaLimitDevRuntimeSyncDepth"] = {"hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03) + cuda2hipRename["cudaLimitDevRuntimePendingLaunchCount"] = {"hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04) cuda2hipRename["cudaDeviceGetLimit"] = {"hipDeviceGetLimit", CONV_DEV, API_RUNTIME}; @@ -1108,10 +1376,9 @@ struct cuda2hipMap { // unsupported yet by HIP cuda2hipRename["cudaReadModeNormalizedFloat"] = {"hipReadModeNormalizedFloat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaTextureFilterMode"] = {"hipTextureFilterMode", CONV_TEX, API_RUNTIME}; - cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaFilterModeLinear"] = {"hipFilterModeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaTextureFilterMode"] = {"hipTextureFilterMode", CONV_TEX, API_RUNTIME}; // API_DRIVER ANALOGUE (CUfilter_mode) + cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX, API_RUNTIME}; // 0 // API_DRIVER ANALOGUE (CU_TR_FILTER_MODE_POINT = 0) + cuda2hipRename["cudaFilterModeLinear"] = {"hipFilterModeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_DRIVER ANALOGUE (CU_TR_FILTER_MODE_LINEAR = 1) cuda2hipRename["cudaBindTexture"] = {"hipBindTexture", CONV_TEX, API_RUNTIME}; cuda2hipRename["cudaUnbindTexture"] = {"hipUnbindTexture", CONV_TEX, API_RUNTIME}; @@ -1131,7 +1398,7 @@ struct cuda2hipMap { cuda2hipRename["cudaChannelFormatDesc"] = {"hipChannelFormatDesc", CONV_TEX, API_RUNTIME}; cuda2hipRename["cudaCreateChannelDesc"] = {"hipCreateChannelDesc", CONV_TEX, API_RUNTIME}; // unsupported yet by HIP - cuda2hipRename["cudaGetChannelDesc"] = {"hipGetChannelDesc", CONV_TEX, 
API_RUNTIME}; + cuda2hipRename["cudaGetChannelDesc"] = {"hipGetChannelDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // Texture Object Management // structs @@ -1139,49 +1406,52 @@ struct cuda2hipMap { cuda2hipRename["cudaResourceDesc"] = {"hipResourceDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaResourceViewDesc"] = {"hipResourceViewDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaTextureDesc"] = {"hipTextureDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - // enums - // unsupported yet by HIP - cuda2hipRename["cudaResourceType"] = {"hipResourceType", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeArray"] = {"hipResourceTypeArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeMipmappedArray"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar2"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar4"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedChar1"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedChar2"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaResViewFormatSignedChar4"] = {"hipResViewFormatSignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort1"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort2"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort4"] = {"hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort1"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort2"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort4"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt1"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt2"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt4"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt1"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt2"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt4"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf1"] = {"hipResViewFormatHalf1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf2"] = {"hipResViewFormatHalf2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf4"] = {"hipResViewFormatHalf4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaResViewFormatFloat1"] = {"hipResViewFormatFloat1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatFloat2"] = {"hipResViewFormatFloat2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatFloat4"] = {"hipResViewFormatFloat4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + // enums + // enum cudaResourceType + cuda2hipRename["cudaResourceType"] = {"hipResourceType", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourcetype) + cuda2hipRename["cudaResourceTypeArray"] = {"hipResourceTypeArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE 
(CU_RESOURCE_TYPE_ARRAY = 0x00) + cuda2hipRename["cudaResourceTypeMipmappedArray"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01) + cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02) + cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03) + + + cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourceViewFormat) + cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00) + cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01) + cuda2hipRename["cudaResViewFormatUnsignedChar2"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02) + cuda2hipRename["cudaResViewFormatUnsignedChar4"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03) + cuda2hipRename["cudaResViewFormatSignedChar1"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04) + cuda2hipRename["cudaResViewFormatSignedChar2"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x05 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05) + cuda2hipRename["cudaResViewFormatSignedChar4"] = {"hipResViewFormatSignedChar4", CONV_TEX, 
API_RUNTIME, HIP_UNSUPPORTED}; // 0x06 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06) + cuda2hipRename["cudaResViewFormatUnsignedShort1"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x07 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07) + cuda2hipRename["cudaResViewFormatUnsignedShort2"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x08 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08) + cuda2hipRename["cudaResViewFormatUnsignedShort4"] = {"hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x09 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09) + cuda2hipRename["cudaResViewFormatSignedShort1"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0a // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a) + cuda2hipRename["cudaResViewFormatSignedShort2"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0b // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b) + cuda2hipRename["cudaResViewFormatSignedShort4"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0c // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c) + cuda2hipRename["cudaResViewFormatUnsignedInt1"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0d // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d) + cuda2hipRename["cudaResViewFormatUnsignedInt2"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0e // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e) + cuda2hipRename["cudaResViewFormatUnsignedInt4"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0f // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f) + cuda2hipRename["cudaResViewFormatSignedInt1"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x10 // 
API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10) + cuda2hipRename["cudaResViewFormatSignedInt2"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x11 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11) + cuda2hipRename["cudaResViewFormatSignedInt4"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x12 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12) + cuda2hipRename["cudaResViewFormatHalf1"] = {"hipResViewFormatHalf1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x13 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13) + cuda2hipRename["cudaResViewFormatHalf2"] = {"hipResViewFormatHalf2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x14 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14) + cuda2hipRename["cudaResViewFormatHalf4"] = {"hipResViewFormatHalf4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x15 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15) + cuda2hipRename["cudaResViewFormatFloat1"] = {"hipResViewFormatFloat1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x16 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16) + cuda2hipRename["cudaResViewFormatFloat2"] = {"hipResViewFormatFloat2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x17 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17) + cuda2hipRename["cudaResViewFormatFloat4"] = {"hipResViewFormatFloat4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x18 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x19 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1a // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a) + 
cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1b // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1c // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1d // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1e // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1f // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed6H"] = {"hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x20 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x21 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x22 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22) cuda2hipRename["cudaTextureAddressMode"] = {"hipTextureAddressMode", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeWrap"] = {"hipAddressModeWrap", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; @@ -1255,17 +1525,19 @@ 
struct cuda2hipMap { cuda2hipRename["cudaGraphicsCubeFacePositiveZ"] = {"hipGraphicsCubeFacePositiveZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGraphicsCubeFaceNegativeZ"] = {"hipGraphicsCubeFaceNegativeZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlags"] = {"hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsNone"] = {"hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsReadOnly"] = {"hipGraphicsMapFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsWriteDiscard"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; + // enum cudaGraphicsMapFlags + cuda2hipRename["cudaGraphicsMapFlags"] = {"hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUgraphicsMapResourceFlags) + cuda2hipRename["cudaGraphicsMapFlagsNone"] = {"hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00) + cuda2hipRename["cudaGraphicsMapFlagsReadOnly"] = {"hipGraphicsMapFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01) + cuda2hipRename["cudaGraphicsMapFlagsWriteDiscard"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02) - cuda2hipRename["cudaGraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsNone"] = {"hipGraphicsRegisterFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsReadOnly"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaGraphicsRegisterFlagsWriteDiscard"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsSurfaceLoadStore"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsTextureGather"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; + // enum cudaGraphicsRegisterFlags + cuda2hipRename["cudaGraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUgraphicsRegisterFlags) + cuda2hipRename["cudaGraphicsRegisterFlagsNone"] = {"hipGraphicsRegisterFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00) + cuda2hipRename["cudaGraphicsRegisterFlagsReadOnly"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01) + cuda2hipRename["cudaGraphicsRegisterFlagsWriteDiscard"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02) + cuda2hipRename["cudaGraphicsRegisterFlagsSurfaceLoadStore"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04) + cuda2hipRename["cudaGraphicsRegisterFlagsTextureGather"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 8 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08) //---------------------------------------BLAS-------------------------------------// // Blas types From 080dd2e0d366f0d9a844870e8fa1190cdae0180a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sat, 22 Apr 2017 01:01:31 +0300 Subject:
[PATCH 039/171] [HIPIFY] [DOC] Readme.md update: Ubuntu 16.04 support --- hipify-clang/README.md | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index 850dfb3ffa..c0d74dbe48 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -13,24 +13,31 @@ `hipify-clang` is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++. The tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the beginning of every HIP kernel call). -`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04** +`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04** and **Ubuntu 16.04**. ### Build and install - Download and unpack clang+llvm 3.8 binary package preqrequisite. + +**Ubuntu 14.04**: ```shell wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz ``` +**Ubuntu 16.04**: +```shell +wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz +tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz +``` - Enable build of hipify-clang and specify path to LLVM. -Note HIPIFY_CLANG_LLVM_DIR must be a full absolute path to the location extracted above. Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ +Note HIPIFY_CLANG_LLVM_DIR must be a full absolute path to the location extracted above. 
Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0/ ```shell cd HIP mkdir build cd build -cmake -DHIPIFY_CLANG_LLVM_DIR=~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release .. +cmake -DHIPIFY_CLANG_LLVM_DIR=~/HIP/clang+llvm-3.8.0/ -DCMAKE_BUILD_TYPE=Release .. make make install ``` @@ -41,13 +48,20 @@ make install In the case when `hipify-clang` doesn't find cuda headers, it reports various errors about unknown keywords (e.g. '\__global\__'), API function names (e.g. 'cudaMalloc'), syntax (e.g. 'foo<<<1,n>>>(...)'), etc. -To install CUDA headers, download the "deb(network)" variant of the target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb. +To install CUDA headers, download the "deb(network)" variant of the target installer. 
+ +**Ubuntu 14.04**: ```shell wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 ``` - +**Ubuntu 16.04**: +```shell +wget http://archive.ubuntu.com/ubuntu/pool/multiverse/n/nvidia-cuda-toolkit/nvidia-cuda-toolkit_7.5.18-0ubuntu1_amd64.deb +sudo dpkg -i nvidia-cuda-toolkit_7.5.18-0ubuntu1_amd64.deb +sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 +``` To set additional options like Language Selection (only "-x cuda" is supported), Preprocessor Definition (-D), Include Path (-I), etc., options delimiter "--" should be used before them, for instance: ```shell @@ -58,10 +72,11 @@ Delimiter "--" is used to separate hipify-clang options (before the delimiter) f Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). -#### Disclaimer +## Disclaimer The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. 
Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. -Copyright (c) 2014-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2014-2017 Advanced Micro Devices, Inc. All rights reserved. + From 36353a560f214fbcde9b762141112bc2a8fac6eb Mon Sep 17 00:00:00 2001 From: James Edwards Date: Fri, 21 Apr 2017 22:34:26 -0500 Subject: [PATCH 040/171] Properly link hip cmake file into top level lib directory. Change-Id: I2113a86ca6985f34fd0cfb091abdbce0f632cfc2 --- packaging/hip_hcc.postinst | 15 ++++++++++----- packaging/hip_hcc.prerm | 10 +++++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/packaging/hip_hcc.postinst b/packaging/hip_hcc.postinst index 14179db767..e7d53b742b 100755 --- a/packaging/hip_hcc.postinst +++ b/packaging/hip_hcc.postinst @@ -8,17 +8,22 @@ popd () { } ROCMDIR=/opt/rocm -HIPDIR=$ROCMDIR/hip - -# Soft-link to libraries -HIPLIBFILES=$HIPDIR/lib/* ROCMLIBDIR=$ROCMDIR/lib +HIPDIR=$ROCMDIR/hip +HIPLIBDIR=$ROCMDIR/hip/lib + +# Soft-link to library files +HIPLIBFILES=$(ls -aF $HIPLIBDIR | grep -v [-/$]) mkdir -p $ROCMLIBDIR +mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR for f in $HIPLIBFILES do ln -s $f $(basename $f) done - ln -s $HIPDIR/lib/.hipInfo .hipInfo +# Make the hip cmake directory link. 
+pushd cmake +ln -s $HIPLIBDIR/cmake/hip hip +popd popd diff --git a/packaging/hip_hcc.prerm b/packaging/hip_hcc.prerm index dda313a3a4..ee64aea632 100755 --- a/packaging/hip_hcc.prerm +++ b/packaging/hip_hcc.prerm @@ -9,17 +9,21 @@ popd () { } ROCMDIR=/opt/rocm +ROCMLIBDIR=$ROCMDIR/lib HIPDIR=$ROCMDIR/hip +HIPLIBDIR=$ROCMDIR/hip/lib # Remove soft-links to libraries -HIPLIBFILES=$HIPDIR/lib/* -ROCMLIBDIR=$ROCMDIR/lib +HIPLIBFILES=$(ls -aF $HIPLIBDIR | grep -v [-/$]) pushd $ROCMLIBDIR for f in $HIPLIBFILES do rm $(basename $f) done -rm .hipInfo +pushd cmake +unlink hip +popd +rmdir --ignore-fail-on-non-empty cmake popd rmdir --ignore-fail-on-non-empty $ROCMLIBDIR From 4869bf5a7cd02ad0bce8057eebbbe098ac30963b Mon Sep 17 00:00:00 2001 From: James Edwards Date: Sat, 22 Apr 2017 15:54:14 -0500 Subject: [PATCH 041/171] Specify full path of hip libraries in link file. Change-Id: I49b788f3489e7abff6b11006ff97fdfca4e5942c --- packaging/hip_hcc.postinst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packaging/hip_hcc.postinst b/packaging/hip_hcc.postinst index e7d53b742b..c7f9c3184c 100755 --- a/packaging/hip_hcc.postinst +++ b/packaging/hip_hcc.postinst @@ -19,7 +19,7 @@ mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR for f in $HIPLIBFILES do - ln -s $f $(basename $f) + ln -s $HIPLIBDIR/$f $(basename $f) done # Make the hip cmake directory link. 
pushd cmake From 3a519ee9e5ea1362c26d2ed9bed24316e893759b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 24 Apr 2017 08:48:35 +0530 Subject: [PATCH 042/171] Updated release notes Change-Id: Ia98aff420ea9d488924dce8fe9168cec9da301ab --- RELEASE.md | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index 34eab60833..21fd8da7bb 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,6 +13,52 @@ Upcoming: ## Revision History: +=================================================================================================== +Release: 1.0.17102 +Date: 2017.03.07 +- Lots of improvements to hipify-clang. +- Added HIP package config for cmake. +- Several bug fixes and documentation updates. + + +=================================================================================================== +Release: 1.0.17066 +Date: 2017.02.11 +- Improved support for math device functions. +- Added several half math device functions. +- Enabled support for CUDA 8.0 in hipify-clang. +- Lots of bug fixes and documentation updates. + + +=================================================================================================== +Release: 1.0.17015 +Date: 2017.01.06 +- Several improvements to the hipify-clang infrastructure. +- Refactored module and function APIs. +- HIP now defaults to linking against the shared runtime library. +- Documentation updates. + + +=================================================================================================== +Release: 1.0.16502 +Date: 2016.12.13 +- Added several fast math and packaged math intrinsics +- Improved debug and profiler documentation +- Support for building and linking to HIP shared library +- Several improvements to hipify-clang +- Several bug fixes + + +=================================================================================================== +Release: 1.0.16461 +Date: 2016.11.14 +- Significant changes to the HIP Profiling APIs.
Refer to the documentation for details +- Improvements to P2P support +- New API: hipDeviceGetByPCIBusId +- Several bug fixes in NV path +- hipModuleLaunch now works for multi-dim kernels + + =================================================================================================== Release:1.0 Date: 2016.11.8 From 72bcfa438b15bc43faf3a6b7e8251b58938f3b88 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 24 Apr 2017 15:24:16 -0500 Subject: [PATCH 043/171] changed arguments for hipPointerGetAttributes Change-Id: Ia7a7c4722c1f7d0a23f0e5cc3dd6dea6c01c1fd8 --- include/hip/hcc_detail/hip_runtime_api.h | 4 ++-- src/hip_memory.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index f9bfb5a310..80a0db7e2e 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -853,7 +853,7 @@ hipError_t hipEventQuery(hipEvent_t event) ; * * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice */ -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr); +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr); /** * @brief Allocate memory on the default accelerator @@ -1922,7 +1922,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. - * @param [in] kernelParams + * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. 
* * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index da5530349f..f7421f9818 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -133,7 +133,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig //_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs. // TODO - add more info here when available. // -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr) { HIP_INIT_API(attributes, ptr); @@ -1268,7 +1268,7 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); if(hsa_status != HSA_STATUS_SUCCESS) hipStatus = hipErrorMapBufferObjectFailed; - } + } #else hipStatus = hipErrorRuntimeOther; #endif From 59ab3659eefa03aea48934ff898d4d0d320a0bc8 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 24 Apr 2017 15:31:07 -0500 Subject: [PATCH 044/171] fixed build issues with hipPointerGetAttributes Change-Id: I3f5fbc05bdaef720884ba949075928752a070377 --- src/hip_memory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index f7421f9818..b706426efb 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -149,10 +149,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; if(attributes->memoryType == hipMemoryTypeHost){ - attributes->hostPointer = ptr; + attributes->hostPointer = (void*)ptr; } if(attributes->memoryType == hipMemoryTypeDevice){ - attributes->devicePointer = ptr; + 
attributes->devicePointer = (void*)ptr; } attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; From 59df70662a39af5d6e136af5e3aa0f0d6bebeb51 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:02:38 -0500 Subject: [PATCH 045/171] Fix hipMalloc to return error code if allocation fails. --- include/hip/hcc_detail/hip_runtime_api.h | 2 +- src/hip_memory.cpp | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 80a0db7e2e..7a99ff0810 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -863,7 +863,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void * * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. * - * @return #hipSuccess + * @return #hipSuccess, #hipErrorMemoryAllocation, #hipErrorInvalidValue (bad context, null *ptr) * * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc */ diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index b706426efb..821f64bc76 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -207,22 +207,26 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) HIP_INIT_API(ptr, sizeBytes); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; + + auto ctx = ihipGetTlsDefaultCtx(); // return NULL pointer when malloc size is 0 if (sizeBytes == 0) { *ptr = NULL; - return ihipLogStatus(hipSuccess); - } + hip_status = hipSuccess; - auto ctx = ihipGetTlsDefaultCtx(); + } else if ((ctx==nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; - if (ctx) { + } else { auto device = ctx->getWriteableDevice(); *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); - } else { - hip_status = hipErrorMemoryAllocation; - } + 
if(sizeBytes && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } + + } return ihipLogStatus(hip_status); From 9e41e3c6e2bd8e6aea862f7d6f73be5a5f9bba89 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:03:32 -0500 Subject: [PATCH 046/171] Fix hip debug for case where copyAgent is null (host-to-host) --- src/hip_hcc.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 35a3e11e71..080d700e63 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1765,20 +1765,24 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, if (HIP_FORCE_P2P_HOST & 0x1) { *forceUnpinnedCopy = true; - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } else { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } } else { *forceUnpinnedCopy = true; tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? 
(*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); if (HIP_FAIL_SOC & 0x2) { fprintf (stderr, "HIP_FAIL_SOC: P2P: copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); throw ihipException(hipErrorRuntimeOther); } } From dc001ef9b41ee8ae3125c7acacaffc5ff71414de Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:06:54 -0500 Subject: [PATCH 047/171] Add negative testing for memory full condition. --- tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1f7599491a..0a256d6362 100644 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -56,5 +56,15 @@ int main(){ HIPCHECK_API(hipFree(NULL) , hipSuccess); HIPCHECK_API(hipHostFree(NULL) , hipSuccess); + + { + // Some negative testing - request a too-big allocation and verify it fails: + // Someday when we support virtual memory may need to refactor these: + size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB; + void *p; + HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation ); + HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation ); + } + passed(); } From 2bc0a6030e8fcabfa379ccceba8017648c0e52a3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:53:31 -0500 Subject: [PATCH 048/171] Refactor hipMemcpy test to share mem alloc for multiple copies. 
--- .vimrc | 1 - tests/src/runtimeApi/memory/hipMemcpy.cpp | 239 ++++++++++++++++------ 2 files changed, 182 insertions(+), 58 deletions(-) delete mode 100644 .vimrc diff --git a/.vimrc b/.vimrc deleted file mode 100644 index 019afa57e6..0000000000 --- a/.vimrc +++ /dev/null @@ -1 +0,0 @@ -:set makeprg=make\ -C\ build.hcc-LC.db diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index a320a86022..d50a810a58 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -38,6 +38,130 @@ void printSep() printf ("======================================================================================\n"); } +//------- +template +class DeviceMemory +{ +public: + DeviceMemory(size_t numElements); + ~DeviceMemory(); +public: + T * A_d; + T* B_d; + T* C_d; + T* C_dd; + + size_t _maxNumElements; +}; + +template +DeviceMemory::DeviceMemory(size_t numElements) + : _maxNumElements(numElements) +{ + T ** np = nullptr; + HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0); + + + size_t sizeElements = numElements * sizeof(T); + + + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); +} + + +template +DeviceMemory::~DeviceMemory () +{ + T * np = nullptr; + HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0); + + HIPCHECK (hipFree(C_dd)); + + C_dd = NULL; +}; + + + +//------- +template +class HostMemory +{ +public: + HostMemory(size_t numElements, bool usePinnedHost); + void reset(size_t numElements, bool full=false) ; + ~HostMemory(); +public: + // Host arrays + T * A_h; + T* B_h; + T* C_h; + + // Host arrays, secondary copy + T * A_hh; + T* B_hh; + + size_t _maxNumElements; + bool _usePinnedHost; +}; + +template +HostMemory::HostMemory(size_t numElements, bool usePinnedHost) + : _maxNumElements(numElements), + _usePinnedHost(usePinnedHost) +{ + T ** np = nullptr; + HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost); + + A_hh = NULL; + B_hh = NULL; + + + 
size_t sizeElements = numElements * sizeof(T); + + if (usePinnedHost) { + HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); + HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + +} + + +template +void +HostMemory::reset(size_t numElements, bool full) +{ + // Initialize the host data: + for (size_t i=0; i +HostMemory::~HostMemory () +{ + HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost); + + if (_usePinnedHost) { + HIPCHECK (hipHostFree(A_hh)); + HIPCHECK (hipHostFree(B_hh)); + + } else { + free(A_hh); + free(B_hh); + } + T *A_hh = NULL; + T *B_hh = NULL; + +}; @@ -52,71 +176,55 @@ void printSep() // IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. // template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - T *A_d, *B_d, *C_d; - T *A_h, *B_h, *C_h; - - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + hmem->reset(numElements); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + assert (numElements <= dmem->_maxNumElements); + assert (numElements <= hmem->_maxNumElements); if (useHostToHost) { - if 
(usePinnedHost) { - HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); - HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); - } else { - A_hh = (T*)malloc(sizeElements); - B_hh = (T*)malloc(sizeElements); - } - - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + // Do an extra device-to-device copy here to mix things up: + HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + //Destroy the original dmem->C_d: + HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements)); - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements); + - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); } @@ -129,11 +237,15 @@ void memcpytest2_for_type(size_t numElements) { printSep(); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); + HostMemory memP(numElements, 1/*usePinnedHost*/); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -156,17 +268,19 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = free/sizeof(T)/5; + maxElem = free/sizeof(T)/20; } printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + memcpytest2(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem+offset, 
1, 1, 1, 0); // pinned host } } @@ -178,13 +292,17 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); + + std::thread t1 (memcpytest2, &memD, &mem1, N, usePinnedHost,0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (memcpytest2,&memD, &mem2, N, usePinnedHost,0,0,0); if (serialize) { t2.join(); } @@ -218,24 +336,30 @@ int main(int argc, char *argv[]) if (p_tests & 0x2) { - // Some tests around the 64MB boundary which have historically shown issues: - printf ("\n\n=== tests&0x2 (64MB boundary)\n"); -#if 0 + // Some tests around the 64KB boundary which have historically shown issues: + printf ("\n\n=== tests&0x2 (64KB boundary)\n"); + size_t maxElem = 32*1024*1024; + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(15*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024+16*1024, 1, 0, 0, 0); -#endif + memcpytest2(&memD, &memP, 15*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0); + // Just over 64MB: - memcpytest2(16*1024*1024+512*1024, 1, 0, 0, 0); - memcpytest2(17*1024*1024+1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 0, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0, 0); + 
memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + + } + if (p_tests & 0x4) { printf ("\n\n=== tests&4 (test sizes and offsets)\n"); HIPCHECK ( hipDeviceReset() ); @@ -270,6 +394,7 @@ int main(int argc, char *argv[]) } + passed(); } From c5d89d9e7fe8b4b03e16908dc2eb0974d7f6ce04 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 12:51:17 -0500 Subject: [PATCH 049/171] Add corrected test for offsets --- tests/src/runtimeApi/memory/hipMemcpy.cpp | 168 +++++++++++++++------- 1 file changed, 115 insertions(+), 53 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index d50a810a58..ad798d70c1 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -24,6 +24,7 @@ THE SOFTWARE. * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 + * RUN_NAMED: %t hipMemcpy-offsets --tests 0x10 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -45,27 +46,42 @@ class DeviceMemory public: DeviceMemory(size_t numElements); ~DeviceMemory(); -public: - T * A_d; - T* B_d; - T* C_d; - T* C_dd; + + T *A_d() const { return _A_d + _offset; }; + T *B_d() const { return _B_d + _offset; }; + T *C_d() const { return _C_d + _offset; }; + T *C_dd() const { return _C_dd + _offset; }; + + size_t maxNumElements() const { return _maxNumElements; }; + + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; + +private: + T * _A_d; + T* _B_d; + T* _C_d; + T* _C_dd; + size_t _maxNumElements; + int _offset; }; template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements) + : _maxNumElements(numElements), + _offset(0) { T ** np = nullptr; - HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0); + HipTest::initArrays (&_A_d, &_B_d, &_C_d, np, np, 
np, numElements, 0); size_t sizeElements = numElements * sizeof(T); - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + HIPCHECK ( hipMalloc(&_C_dd, sizeElements) ); } @@ -73,11 +89,11 @@ template DeviceMemory::~DeviceMemory () { T * np = nullptr; - HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0); + HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); - HIPCHECK (hipFree(C_dd)); + HIPCHECK (hipFree(_C_dd)); - C_dd = NULL; + _C_dd = NULL; }; @@ -90,6 +106,8 @@ public: HostMemory(size_t numElements, bool usePinnedHost); void reset(size_t numElements, bool full=false) ; ~HostMemory(); + + size_t maxNumElements() const { return _maxNumElements; }; public: // Host arrays T * A_h; @@ -176,21 +194,22 @@ HostMemory::~HostMemory () // IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. // template -void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:%+d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, - hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, + dmem->offset()); hmem->reset(numElements); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - assert (numElements <= dmem->_maxNumElements); - assert (numElements <= hmem->_maxNumElements); + assert (numElements <= dmem->maxNumElements()); + 
assert (numElements <= hmem->maxNumElements()); @@ -200,25 +219,25 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: - HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + HIPCHECK ( hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements, useMemkindDefault? 
hipMemcpyDefault : hipMemcpyDeviceToDevice)); - //Destroy the original dmem->C_d: - HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements)); + //Destroy the original dmem->C_d(): + HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); @@ -245,7 +264,7 @@ void memcpytest2_for_type(size_t numElements) for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -256,7 +275,7 @@ void memcpytest2_for_type(size_t numElements) //--- //Try many different sizes to memory copy. 
template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +void memcpytest2_sizes(size_t maxElem=0) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); @@ -268,19 +287,59 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = free/sizeof(T)/20; + maxElem = free/sizeof(T)/5; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); DeviceMemory memD(maxElem); HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - memcpytest2(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host - memcpytest2(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host + for (size_t elem=1; elem<=maxElem; elem*=2) { + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } +} + + +//--- +//Try many different sizes to memory copy. 
+template +void memcpytest2_offsets(size_t maxElem) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); + + size_t elem = maxElem / 2; + + for (int offset=0; offset < 512; offset++) { + assert (elem + offset < maxElem); + memD.offset(offset); + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } + + for (int offset=512; offset < maxElem; offset*=2) { + assert (elem + offset < maxElem); + memD.offset(offset); + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } } @@ -296,13 +355,13 @@ void multiThread_1(bool serialize, bool usePinnedHost) HostMemory mem1(N, usePinnedHost); HostMemory mem2(N, usePinnedHost); - std::thread t1 (memcpytest2, &memD, &mem1, N, usePinnedHost,0,0,0); + std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,&memD, &mem2, N, usePinnedHost,0,0,0); + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); } @@ -343,17 +402,17 @@ int main(int argc, char *argv[]) HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 
16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -361,16 +420,19 @@ int main(int argc, char *argv[]) if (p_tests & 0x4) { - printf ("\n\n=== tests&4 (test sizes and offsets)\n"); + printf ("\n\n=== tests&4 (test sizes)\n"); HIPCHECK ( hipDeviceReset() ); + memcpytest2_sizes(0); printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); + } + + + if (p_tests & 0x10) { + printf ("\n\n=== tests&4 (test offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + memcpytest2_offsets(256*1024*1024); + memcpytest2_offsets(256*1024*1024); + memcpytest2_offsets(256*1024*1024); } if (p_tests & 0x8) { From ab410add5cb253823fc21b55fb31635171c5af71 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 16:02:22 -0500 Subject: [PATCH 050/171] Add test for non-page-aligned mem copies. 
--- tests/src/runtimeApi/memory/hipMemcpy.cpp | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index ad798d70c1..c48f780e44 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 - * RUN_NAMED: %t hipMemcpy-offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-dev_offsets --tests 0x10 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -335,7 +335,7 @@ void memcpytest2_offsets(size_t maxElem) memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } - for (int offset=512; offset < maxElem; offset*=2) { + for (int offset=512; offset < elem; offset*=2) { assert (elem + offset < maxElem); memD.offset(offset); memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host @@ -427,13 +427,6 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x10) { - printf ("\n\n=== tests&4 (test offsets)\n"); - HIPCHECK ( hipDeviceReset() ); - memcpytest2_offsets(256*1024*1024); - memcpytest2_offsets(256*1024*1024); - memcpytest2_offsets(256*1024*1024); - } if (p_tests & 0x8) { printf ("\n\n=== tests&8\n"); @@ -456,6 +449,16 @@ int main(int argc, char *argv[]) } + if (p_tests & 0x10) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize); + memcpytest2_offsets (maxSize); + memcpytest2_offsets(maxSize); + } + + passed(); From 5a52b79782721cc670d5c590822d06190283d550 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 16:55:29 -0500 Subject: [PATCH 051/171] Tailor pointer info for src/dst before calling HCC copy routines. HCC sometimes uses the srcPtrInfo or dstPtrInfo to determine the pointer. 
Make sure these use the actual pointer and not the base of the allocation. --- src/hip_hcc.cpp | 66 ++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 080d700e63..71d947488d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1798,6 +1798,62 @@ void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const h } +// the pointer-info as returned by HC refers to the allocation +// This routine modifies the pointer-info so it appears to refer to the specific ptr and sizeBytes. +// TODO -remove this when HCC uses HSA pointer info functions directly. +void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + const char *ptrc = static_cast (ptr); + if (ptrInfo->_sizeBytes == 0) { + // invalid ptrInfo, don't modify + return; + } else if (ptrInfo->_isInDeviceMem) { + assert (ptrInfo->_devicePointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_devicePointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_devicePointer = const_cast (ptr); + + if (ptrInfo->_hostPointer != nullptr) { + ptrInfo->_hostPointer = static_cast(ptrInfo->_hostPointer) + diff; + } + + } else { + + assert (ptrInfo->_hostPointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_hostPointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_hostPointer = const_cast(ptr); + + if (ptrInfo->_devicePointer != nullptr) { + ptrInfo->_devicePointer = static_cast(ptrInfo->_devicePointer) + diff; + } + } + + assert (sizeBytes <= ptrInfo->_sizeBytes); + ptrInfo->_sizeBytes = sizeBytes; +}; + + +bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) 
== AM_SUCCESS); + + if (tracked) { + tailorPtrInfo(ptrInfo, ptr, sizeBytes); + } + + return tracked; +}; + + // TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation. // What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i // The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur. @@ -1816,8 +1872,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: @@ -1877,6 +1933,7 @@ void ihipStream_t::lockedSymbolCopySync(hc::accelerator &acc, void* dst, void* s void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* src, size_t sizeBytes, size_t offset, unsigned kind) { + // TODO - review - this looks broken , should not be adding pointers to tracker dynamically: if(kind == hipMemcpyHostToDevice) { hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); @@ -1903,6 +1960,7 @@ void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* } } + void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { @@ -1930,8 +1988,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes 
hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); hc::hcCommandKind hcCopyDir; From 8ba993e36c60275ef23324513f5cb09b95103c81 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 20:38:37 -0500 Subject: [PATCH 052/171] Refactor hipHostRegister to cover misaligned cases. --- .../src/runtimeApi/memory/hipHostRegister.cpp | 129 ++++++++++++------ 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp index 1a1319c500..efa23b4068 100644 --- a/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,87 +19,126 @@ THE SOFTWARE. 
/* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t + * RUN: %t --tests 0x1 + * RUN: %t --tests 0x2 * HIT_END */ +// TODO - bug if run both back-to-back + #include"test_common.h" #include __global__ void Inc(hipLaunchParm lp, float *Ad){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; -Ad[tx] = Ad[tx] + float(1); + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = Ad[tx] + float(1); } -int main(){ - float *A, **Ad; - int num_devices; - HIPCHECK(hipGetDeviceCount(&num_devices)); - Ad = new float*[num_devices]; - const size_t size = N * sizeof(float); - A = (float*)malloc(size); - HIPCHECK(hipHostRegister(A, size, 0)); + +template +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) +{ + A = A + offset; + numElements -= offset; + + size_t sizeBytes = numElements * sizeof(T); + + HIPCHECK(hipHostRegister(A, sizeBytes, 0)); - for(int i=0;iOFFSETS_TO_TRY); + for (size_t i=0; i Date: Mon, 24 Apr 2017 21:05:29 -0500 Subject: [PATCH 053/171] Refactor hipHostRegister test. - Add more testing for offsets. - Parse cmdline options and use --tests. --- .../src/runtimeApi/memory/hipHostRegister.cpp | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp index efa23b4068..8cf0979261 100644 --- a/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -21,10 +21,11 @@ THE SOFTWARE. 
* BUILD: %t %s ../../test_common.cpp * RUN: %t --tests 0x1 * RUN: %t --tests 0x2 + * RUN: %t --tests 0x4 * HIT_END */ -// TODO - bug if run both back-to-back +// TODO - bug if run both back-to-back, once fixed should just need one command line #include"test_common.h" #include @@ -36,14 +37,16 @@ __global__ void Inc(hipLaunchParm lp, float *Ad){ template -void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) { A = A + offset; numElements -= offset; size_t sizeBytes = numElements * sizeof(T); - HIPCHECK(hipHostRegister(A, sizeBytes, 0)); + if (internalRegister) { + HIPCHECK(hipHostRegister(A, sizeBytes, 0)); + } // Reset @@ -67,7 +70,9 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) }; } - HIPCHECK(hipHostUnregister(A)); + if (internalRegister) { + HIPCHECK(hipHostUnregister(A)); + } } @@ -112,7 +117,7 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x2) { + if (p_tests & 0x6) { // Sensitize HIP bug if device does not match where the memory was registered. HIPCHECK(hipSetDevice(0)); @@ -125,11 +130,22 @@ int main(int argc, char *argv[]) Bh = (float*)malloc(size); HIPCHECK(hipMalloc(&Bd, size)); - // TODO - change to 256: + // TODO - set to 128 #define OFFSETS_TO_TRY 1 assert (N>OFFSETS_TO_TRY); - for (size_t i=0; i Date: Mon, 24 Apr 2017 21:22:56 -0500 Subject: [PATCH 054/171] Add host offset checking --- tests/src/runtimeApi/memory/hipMemcpy.cpp | 91 ++++++++++++++++------- 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index c48f780e44..749ec0de77 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -24,7 +24,8 @@ THE SOFTWARE. 
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 - * RUN_NAMED: %t hipMemcpy-dev_offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-dev-offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-host-offsets --tests 0x20 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -107,28 +108,43 @@ public: void reset(size_t numElements, bool full=false) ; ~HostMemory(); + + T *A_h() const { return _A_h + _offset; }; + T *B_h() const { return _B_h + _offset; }; + T *C_h() const { return _C_h + _offset; }; + + + size_t maxNumElements() const { return _maxNumElements; }; + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; public: - // Host arrays - T * A_h; - T* B_h; - T* C_h; // Host arrays, secondary copy T * A_hh; T* B_hh; - size_t _maxNumElements; bool _usePinnedHost; +private: + size_t _maxNumElements; + + int _offset; + + // Host arrays + T * _A_h; + T* _B_h; + T* _C_h; }; template HostMemory::HostMemory(size_t numElements, bool usePinnedHost) : _maxNumElements(numElements), - _usePinnedHost(usePinnedHost) + _usePinnedHost(usePinnedHost), + _offset(0) { T ** np = nullptr; - HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost); + HipTest::initArrays (np, np, np, &_A_h, &_B_h, &_C_h, numElements, usePinnedHost); A_hh = NULL; B_hh = NULL; @@ -157,8 +173,8 @@ HostMemory::reset(size_t numElements, bool full) (B_hh)[i] = 1492.0 + i; // Phi if (full) { - (A_h)[i] = 3.146f + i; // Pi - (B_h)[i] = 1.618f + i; // Phi + (_A_h)[i] = 3.146f + i; // Pi + (_B_h)[i] = 1.618f + i; // Phi } } } @@ -166,7 +182,7 @@ HostMemory::reset(size_t numElements, bool full) template HostMemory::~HostMemory () { - HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost); + HipTest::freeArraysForHost (_A_h, _B_h, _C_h, _usePinnedHost); if (_usePinnedHost) { HIPCHECK (hipHostFree(A_hh)); @@ -197,12 +213,13 @@ template void 
memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:%+d\n", + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, - dmem->offset()); + dmem->offset(), hmem->offset() + ); hmem->reset(numElements); @@ -215,15 +232,15 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, if (useHostToHost) { // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); @@ -235,13 +252,13 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, //Destroy the original dmem->C_d(): HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements); + HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements); @@ -307,7 +324,7 @@ void memcpytest2_sizes(size_t maxElem=0) //--- //Try many different sizes to memory copy. 
template -void memcpytest2_offsets(size_t maxElem) +void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); @@ -330,14 +347,26 @@ void memcpytest2_offsets(size_t maxElem) for (int offset=0; offset < 512; offset++) { assert (elem + offset < maxElem); - memD.offset(offset); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } for (int offset=512; offset < elem; offset*=2) { assert (elem + offset < maxElem); - memD.offset(offset); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } @@ -453,9 +482,19 @@ int main(int argc, char *argv[]) printf ("\n\n=== tests&0x10 (test device offsets)\n"); HIPCHECK ( hipDeviceReset() ); size_t maxSize = 256*1024; - memcpytest2_offsets (maxSize); - memcpytest2_offsets (maxSize); - memcpytest2_offsets(maxSize); + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets(maxSize, true, false); + } + + + if (p_tests & 0x20) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets(maxSize, false, true); } From 54561c8af3ac3052fefa6d8c8f6b9d18274d08fa Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Tue, 25 Apr 2017 00:13:32 -0500 Subject: [PATCH 055/171] fix hip_complex.h header on NV path Change-Id: Ia95d003ca1b284bab1c76723050e6b3b89178f65 --- include/hip/nvcc_detail/hip_complex.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/include/hip/nvcc_detail/hip_complex.h b/include/hip/nvcc_detail/hip_complex.h index 84afb13e50..20cb24460c 100644 --- a/include/hip/nvcc_detail/hip_complex.h +++ b/include/hip/nvcc_detail/hip_complex.h @@ -64,7 +64,7 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi } __device__ __host__ static inline float hipCabsf(hipFloatComplex z){ - return cuCabsf(p, q); + return cuCabsf(z); } typedef cuDoubleComplex hipDoubleComplex; @@ -85,7 +85,7 @@ __device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ return cuConj(z); } -__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){ return cuCabs(z) * cuCabs(z); } @@ -123,7 +123,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q return cuCfmaf(p, q, r); } -__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){ +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ return cuCfma(p, q, r); } From fc6248ce82cb62032ed7138e659d520ba59f95f6 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 18:56:57 -0500 Subject: [PATCH 056/171] added hipFuncSetCacheConfig API for nvcc path Change-Id: I87fae35bc0e10a0dca5ae1c5015fe5d9e52a1d0d --- include/hip/nvcc_detail/hip_runtime_api.h | 4 +++ .../module/hipFuncSetCacheConfig.cpp | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 0cc40f32af..4feefcc342 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -897,6 +897,10 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, } +inline static hipError_t hipFuncSetCacheConfig(const 
void* func, hipFuncCache_t cacheConfig) +{ + return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); +} #ifdef __cplusplus } diff --git a/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp b/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp new file mode 100644 index 0000000000..e3c3efad3d --- /dev/null +++ b/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include +#include +#include"test_common.h" + +int main(){ + hipFuncCache_t cacheConfig; + void *func; + hipFuncSetCacheConfig(func, cacheConfig); + passed(); +} + From ab2eb420e2bfdfd1f5972a4341b5fb4ccf41c24a Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 19:01:10 -0500 Subject: [PATCH 057/171] fixed fast math expf and exp10f Change-Id: I73963220f902efebb0a7404c5f8966dffb4c35ca --- src/device_util.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/device_util.cpp b/src/device_util.cpp index 8ce53765b5..b730412874 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -1163,18 +1163,18 @@ __device__ double __hip_precise_dsqrt_rz(double x) { return hc::precise_math::sqrt(x); } -#define LOG_BASE2_E_DIV_2 0.4426950408894701 -#define LOG_BASE2_5 2.321928094887362 +#define LOG_BASE2_E 1.4426950408889634 +#define LOG_BASE2_10 3.32192809488736 #define ONE_DIV_LOG_BASE2_E 0.69314718056 #define ONE_DIV_LOG_BASE2_10 0.30102999566 // Fast Math Intrinsics __device__ float __hip_fast_exp10f(float x) { - return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2); + return __hip_fast_exp2f(x*LOG_BASE2_E); } __device__ float __hip_fast_expf(float x) { - return __hip_fast_expf(x*LOG_BASE2_5); + return __hip_fast_exp2f(x*LOG_BASE2_10); } __device__ float __hip_fast_frsqrt_rn(float x) { From a5cb2d40ec2e8684ee57faa709b293d84eef4459 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 28 Apr 2017 11:53:11 -0500 Subject: [PATCH 058/171] fixed hipFuncSetCacheConfig on rocm path Change-Id: I937a3afbf115edc94a753a0beb2230ed60a6f021 --- include/hip/hcc_detail/hip_runtime_api.h | 2 +- src/hip_device.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 7a99ff0810..6917f04f96 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ 
b/include/hip/hcc_detail/hip_runtime_api.h @@ -385,7 +385,7 @@ hipError_t hipDeviceGetLimit(size_t *pValue, enum hipLimit_t limit); * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * */ -hipError_t hipFuncSetCacheConfig ( hipFuncCache_t config ); +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t config ); /** * @brief Returns bank width of shared memory for current device diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 88d94411e8..01a213190f 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -112,7 +112,7 @@ hipError_t hipDeviceGetLimit (size_t *pValue, hipLimit_t limit) } } -hipError_t hipFuncSetCacheConfig (hipFuncCache_t cacheConfig) +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig) { HIP_INIT_API(cacheConfig); From 3d88932c8de55de0e44df5f324899f87645c876c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:03:03 +0300 Subject: [PATCH 059/171] [HIPIFY] [FIX] replacement error: cudaError_t -> hipError_t_t https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/71 [Solution] getUnqualifiedType for enumConstantDecl's type is added, except ordinary enum declarations (w/o typedef). [ToDo] Find more appropriate way of distinguishing redefined enum declarations and ordinary ones. 
--- hipify-clang/src/Cuda2Hip.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 5a2940322e..f0fa8331dc 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -191,7 +191,7 @@ struct cuda2hipMap { // Error codes and return types cuda2hipRename["CUresult"] = {"hipError_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; +// cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaError"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; @@ -2806,12 +2806,11 @@ private: bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = - enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - // anonymous typedef enum - if (name.empty()) { - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); - name = QT.getAsString(); + StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + StringRef name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + name = name_unqualified; } SourceLocation sl = enumConstantDecl->getLocStart(); SourceManager *SM = Result.SourceManager; From eddd02199620f12bea2bb3f3f94f896ad788ccf3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:45:36 +0300 Subject: [PATCH 060/171] * [HIPIFY] [FIX] Replacement error: enum cudaMemcpyKind kind -> hipMemcpyKindyKind kind https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/72 [Solution] [Workaround] Offset calculation for enum VarDecl as param decl, declared with enum type specifier. 
[Result] enum cudaMemcpyKind kind -> enum hipMemcpyKind kind [ToDo] Test on terminal qualifiers (const, etc). --- hipify-clang/src/Cuda2Hip.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index f0fa8331dc..35c930f3af 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2812,8 +2812,19 @@ private: if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; } - SourceLocation sl = enumConstantDecl->getLocStart(); + // Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- SourceManager *SM = Result.SourceManager; + SourceLocation sl(enumConstantDecl->getLocStart()); + SourceLocation end(enumConstantDecl->getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { updateCounters(found->second, name.str()); From c7958cbb8b3c3a59921ec5ac5a669bc4430b2ee5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:59:33 +0300 Subject: [PATCH 061/171] [HIPIFY] Rename enumConstantDecl -> enumDecl Reason: not to mix up with clang's enumConstantDecl, used for enum DeclRefExpr (enum constant). 
--- hipify-clang/src/Cuda2Hip.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 35c930f3af..390b4ee88c 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2804,10 +2804,10 @@ private: return false; } - bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { - if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + bool cudaEnumDecl(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { + StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumDecl->getType().getUnqualifiedType(); StringRef name_unqualified = QT.getAsString(); if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; @@ -2816,8 +2816,8 @@ private: // Example: void func(enum cudaMemcpyKind kind); //------------------------------------------------- SourceManager *SM = Result.SourceManager; - SourceLocation sl(enumConstantDecl->getLocStart()); - SourceLocation end(enumConstantDecl->getLocEnd()); + SourceLocation sl(enumDecl->getLocStart()); + SourceLocation end(enumDecl->getLocEnd()); size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); size_t offset = sfull.find(name); @@ -3123,7 +3123,7 @@ public: if (cudaCall(Result)) break; if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; - if (cudaEnumConstantDecl(Result)) break; + if (cudaEnumDecl(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3169,7 +3169,7 @@ void 
addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(enumDecl())) - .bind("cudaEnumConstantDecl"), + .bind("cudaEnumDecl"), Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) From ec27c695c4ea83804d679dc7d07f947a07ece3c8 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 3 May 2017 22:29:12 +0530 Subject: [PATCH 062/171] Added support for hipMemcpy2DAsync in HIP/HCC Change-Id: Ia4a8306f2dc1e33a81a7195ec29aef652fcccc4b --- include/hip/hcc_detail/hip_runtime_api.h | 21 +++++++++++++++++++++ src/hip_memory.cpp | 18 ++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 6917f04f96..9cfd21c1d2 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1308,6 +1308,27 @@ hipError_t hipFreeArray(hipArray* array); */ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @param[in] stream Stream to use + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ +#if __cplusplus +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0); +#else +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream); +#endif + /** * @brief Copies data between host and device. 
* diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 821f64bc76..c4bc7db096 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -793,6 +793,24 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, return ihipLogStatus(e); } +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, + size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + if(width > dpitch || width > spitch) + return ihipLogStatus(hipErrorUnknown); + hipError_t e = hipSuccess; + try { + for(int i = 0; i < height; ++i) { + e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream); + } + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { From 70c94d7b835dcff1a7d43a065ccea1218bbc48f5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 21:59:48 +0300 Subject: [PATCH 063/171] [HIPIFY] HIPIFY and HIP sync with CUDA Driver API data types. + Update CUDA_Driver_API_functions_supported_by_HIP.md. + Final update of HIPIFY with CUDA driver data types. [TODO] Syncing HIPIFY and HIP by CUDA Driver API functions. 
--- ...A_Driver_API_functions_supported_by_HIP.md | 62 ++++ hipify-clang/src/Cuda2Hip.cpp | 324 +++++++++++------- 2 files changed, 253 insertions(+), 133 deletions(-) diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 3434d29a70..ad9d791a6d 100644 --- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -323,6 +323,68 @@ | 500 |*`CUDA_ERROR_NOT_FOUND`* |*`hipErrorNotFound`* | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. | | 600 |*`CUDA_ERROR_NOT_READY`* |*`hipErrorNotReady`* | This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). | | 700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`* |*`hipErrorIllegalAddress`* | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 701 |*`CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`* |*`hipErrorLaunchOutOfResources`* | This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. 
| +| 702 |*`CUDA_ERROR_LAUNCH_TIMEOUT`* |*`hipErrorLaunchTimeOut`* | This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The context cannot be used (and must be destroyed similar to CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 703 |*`CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`* | | This error indicates a kernel launch that uses an incompatible texturing mode. | +| 704 |*`CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`* |*`hipErrorPeerAccessAlreadyEnabled`* | This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. | +| 705 |*`CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`* |*`hipErrorPeerAccessNotEnabled`* | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). | +| 708 |*`CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`* | | This error indicates that the primary context for the specified device has already been initialized. | +| 709 |*`CUDA_ERROR_CONTEXT_IS_DESTROYED`* | | This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. | +| 710 |*`CUDA_ERROR_ASSERT`* | | A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. 
| +| 711 |*`CUDA_ERROR_TOO_MANY_PEERS`* | | This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). | +| 712 |*`CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`* |*`hipErrorHostMemoryAlreadyRegistered`* | This error indicates that the memory range passed to cuMemHostRegister() has already been registered. | +| 713 |*`CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`* |*`hipErrorHostMemoryNotRegistered`* | This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. | +| 714 |*`CUDA_ERROR_HARDWARE_STACK_ERROR`* | | While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 715 |*`CUDA_ERROR_ILLEGAL_INSTRUCTION`* | | While executing a kernel, the device encountered an illegal instruction. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 716 |*`CUDA_ERROR_MISALIGNED_ADDRESS`* | | While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. 
| +| 717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`* | | While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 718 |*`CUDA_ERROR_INVALID_PC`* | | While executing a kernel, the device program counter wrapped its address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 719 |*`CUDA_ERROR_LAUNCH_FAILED`* | | An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 800 |*`CUDA_ERROR_NOT_PERMITTED`* | | This error indicates that the attempted operation is not permitted. | +| 801 |*`CUDA_ERROR_NOT_SUPPORTED`* | | This error indicates that the attempted operation is not supported on the current system or device. | +| 999 |*`CUDA_ERROR_UNKNOWN`* | | This indicates that an unknown internal error has occurred. 
| +| enum |***`CUstream_flags`*** |***`hipStreamFlags`*** | Stream creation flags | +| 0x0 |*`CU_STREAM_DEFAULT`* |*`hipStreamDefault`* | Default stream flag | +| 0x1 |*`CU_STREAM_NON_BLOCKING`* |*`hipStreamNonBlocking`* | Stream does not synchronize with stream 0 (the NULL stream) | +| typedef | `CUarray` | `hipArray *` | CUDA array | +| struct | `CUarray_st` | `hipArray` | CUDA array | +| typedef | `CUcontext` | `hipCtx_t` | CUDA context | +| typedef | `CUdevice` | `hipDevice_t` | CUDA device | +| typedef | `CUdeviceptr` | `hipDeviceptr_t` | CUDA device pointer. CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. | +| typedef | `CUevent` | `hipEvent_t` | CUDA event | +| typedef | `CUfunction` | `hipFunction_t` | CUDA function | +| typedef | `CUgraphicsResource` | | CUDA graphics interop resource | +| typedef | `CUmipmappedArray` | | CUDA mipmapped array | +| typedef | `CUmodule` | `hipModule_t` | CUDA module | +| typedef | `CUstream` | `hipStream_t` | CUDA stream | +| typedef | `CUstreamCallback` | `hipStreamCallback_t` | CUDA stream callback | +| typedef | `CUsurfObject` | | An opaque value that represents a CUDA surface object | +| typedef | `CUsurfref` | | CUDA surface reference | +| typedef | `CUtexObject` | | An opaque value that represents a CUDA texture object | +| typedef | `CUtexref` | | CUDA texture reference | +| define |`CU_IPC_HANDLE_SIZE` | | CUDA IPC handle size. | +| define |`CU_LAUNCH_PARAM_BUFFER_POINTER` | `HIP_LAUNCH_PARAM_BUFFER_POINTER` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a buffer containing all kernel parameters used for launching kernel f. This buffer needs to honor all alignment/padding requirements of the individual parameters. If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. 
| +| define |`CU_LAUNCH_PARAM_BUFFER_SIZE` | `HIP_LAUNCH_PARAM_BUFFER_SIZE` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a size_t which contains the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER. It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. | +| define |`CU_LAUNCH_PARAM_END` | `HIP_LAUNCH_PARAM_END` | End of array terminator for the extra parameter to cuLaunchKernel. | +| define |`CU_MEMHOSTALLOC_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_WRITECOMBINED` | | If set, host memory is allocated as write-combined - fast to write, faster to DMA, slow to read except via SSE4 streaming load instruction (MOVNTDQA). Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTREGISTER_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_IOMEMORY` | | If set, the passed memory pointer is treated as pointing to some memory-mapped I/O space, e.g. belonging to a third-party PCIe device. On Windows the flag is a no-op. On Linux that memory is marked as non cache-coherent for the GPU and is expected to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED is returned. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostRegister(). 
| +| define |`CU_PARAM_TR_DEFAULT` | | For texture references loaded into the module, use default texunit from texture reference. | +| define |`CU_STREAM_LEGACY` | | Legacy stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior. See details of the synchronization behavior. | +| define |`CU_STREAM_PER_THREAD` | | Per-thread stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with per-thread synchronization behavior. See details of the synchronization behavior. | +| define |`CU_TRSA_OVERRIDE_FORMAT` | | Override the texref format with a format inferred from the array. Flag for cuTexRefSetArray(). | +| define |`CU_TRSF_NORMALIZED_COORDINATES` | | Use normalized texture coordinates in the range [0,1) instead of [0,dim). Flag for cuTexRefSetFlags(). | +| define |`CU_TRSF_SRGB` | | Perform sRGB->linear conversion during texture read. Flag for cuTexRefSetFlags(). | +| define |`CUDA_ARRAY3D_2DARRAY` | | Deprecated, use CUDA_ARRAY3D_LAYERED. | +| define |`CUDA_ARRAY3D_CUBEMAP` | | If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. | +| define |`CUDA_ARRAY3D_DEPTH_TEXTURE` | | This flag if set indicates that the CUDA array is a DEPTH_TEXTURE. | +| define |`CUDA_ARRAY3D_LAYERED` | | If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. | +| define |`CUDA_ARRAY3D_SURFACE_LDST` | | This flag must be set in order to bind a surface reference to the CUDA array. 
| +| define |`CUDA_ARRAY3D_TEXTURE_GATHER` | | This flag must be set in order to perform texture gather operations on a CUDA array. | +| define |`CUDA_VERSION` | | CUDA API version number. | ## **2. Error Handling** diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 390b4ee88c..b3b2b993e3 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -208,59 +208,55 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 + cuda2hipRename["CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"] = {"hipErrorLaunchIncompatibleTexturing", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 703 + cuda2hipRename["CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"] = {"hipErrorPrimaryContextActive", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 708 + cuda2hipRename["CUDA_ERROR_CONTEXT_IS_DESTROYED"] = {"hipErrorContextIsDestroyed", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 709 + cuda2hipRename["CUDA_ERROR_NOT_PERMITTED"] = {"hipErrorNotPermitted", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 800 + cuda2hipRename["CUDA_ERROR_NOT_SUPPORTED"] = {"hipErrorNotSupported", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 801 // CUDA RT API error code only - cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 - cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 - cuda2hipRename["cudaErrorInvalidPitchValue"] = 
{"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 - cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 - cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 - cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 - cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 - cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 - cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 - cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 - cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 - cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 - cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 - cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 - cuda2hipRename["cudaErrorInvalidFilterSetting"] = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 - cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 - cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 + cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 
+ cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 + cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 + cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 + cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 + cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 + cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 + cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 + cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 + cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 + cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 + cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 + cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 + cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 + cuda2hipRename["cudaErrorInvalidFilterSetting"] = 
{"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 + cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 + cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 + cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 // Deprecated as of CUDA 3.1 - cuda2hipRename["cudaErrorMemoryValueTooLarge"] = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 - cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 - cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 - cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 - cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 - cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 - cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 - cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 - cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 - cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 - cuda2hipRename["cudaErrorAssert"] = 
{"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 - cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 - cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 - cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 - cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 - cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 - cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 - cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70 - cuda2hipRename["cudaErrorNotSupported"] = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 - cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 - cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 - cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 - cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 - cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 - cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f + cuda2hipRename["cudaErrorMemoryValueTooLarge"] = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 + 
cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 + cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 + cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 + cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 + cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 + cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 + cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 + cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 + cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 + cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 + cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 + cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 + cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 + cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 + cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70 + cuda2hipRename["cudaErrorNotSupported"] = 
{"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 + cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 - - + cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; // 0 cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 @@ -346,34 +342,50 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; // 700 cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; // 701 + cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; // 719 + cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; // 702 + cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; // 704 + cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; // 705 + 
cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 + + cuda2hipRename["CUDA_ERROR_ASSERT"] = {"hipErrorAssert", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 710 + cuda2hipRename["cudaErrorAssert"] = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 + + cuda2hipRename["CUDA_ERROR_TOO_MANY_PEERS"] = {"hipErrorTooManyPeers", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 711 + cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; // 712 + cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; // 713 + cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + + cuda2hipRename["CUDA_ERROR_HARDWARE_STACK_ERROR"] = {"hipErrorHardwareStackError", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 714 + cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 + + cuda2hipRename["CUDA_ERROR_ILLEGAL_INSTRUCTION"] = {"hipErrorIllegalInstruction", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 715 + cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 + + cuda2hipRename["CUDA_ERROR_MISALIGNED_ADDRESS"] = {"hipErrorMisalignedAddress", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 716 + cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 + + cuda2hipRename["CUDA_ERROR_INVALID_ADDRESS_SPACE"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 717 + 
cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 + + cuda2hipRename["CUDA_ERROR_INVALID_PC"] = {"hipErrorInvalidPc", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 718 + cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 + + cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 719 cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 - cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 - - cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - - cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 - -// cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorInitializationError", CONV_ERR, API_DRIVER}; -// cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = 
{"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 999 + cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 ///////////////////////////// CUDA DRIVER API ///////////////////////////// // enums @@ -389,59 +401,86 @@ struct cuda2hipMap { cuda2hipRename["CUipcEventHandle"] = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUipcMemHandle"] = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["CUaddress_mode"] = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 cuda2hipRename["CUarray_cubemap_face"] = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 cuda2hipRename["CUarray_format"] = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = 
{"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a - cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a + cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 // Compute mode - cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) - 
cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) + cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) + cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // 
((void*)0x01) + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; // ((void*)0x02) + cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; // ((void*)0x00) + cuda2hipRename["CU_IPC_HANDLE_SIZE"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 + cuda2hipRename["CU_MEMHOSTALLOC_DEVICEMAP"] = {"HIP_MEMHOSTALLOC_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTALLOC_PORTABLE"] = {"HIP_MEMHOSTALLOC_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_MEMHOSTALLOC_WRITECOMBINED"] = {"HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_DEVICEMAP"] = {"HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTREGISTER_IOMEMORY"] = {"HIP_MEMHOSTREGISTER_IOMEMORY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_PORTABLE"] = {"HIP_MEMHOSTREGISTER_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_PARAM_TR_DEFAULT"] = {"HIP_PARAM_TR_DEFAULT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // -1 + cuda2hipRename["CU_STREAM_LEGACY"] = {"HIP_STREAM_LEGACY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x1) + cuda2hipRename["CU_STREAM_PER_THREAD"] = {"HIP_STREAM_PER_THREAD", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x2) + cuda2hipRename["CU_TRSA_OVERRIDE_FORMAT"] = {"HIP_TRSA_OVERRIDE_FORMAT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_NORMALIZED_COORDINATES"] = {"HIP_TRSF_NORMALIZED_COORDINATES", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};// 0x02 + cuda2hipRename["CU_TRSF_READ_AS_INTEGER"] = {"HIP_TRSF_READ_AS_INTEGER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_SRGB"] = {"HIP_TRSF_SRGB", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + // Deprecated, 
use CUDA_ARRAY3D_LAYERED + cuda2hipRename["CUDA_ARRAY3D_2DARRAY"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_CUBEMAP"] = {"HIP_ARRAY3D_CUBEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CUDA_ARRAY3D_DEPTH_TEXTURE"] = {"HIP_ARRAY3D_DEPTH_TEXTURE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CUDA_ARRAY3D_LAYERED"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_SURFACE_LDST"] = {"HIP_ARRAY3D_SURFACE_LDST", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CUDA_ARRAY3D_TEXTURE_GATHER"] = {"HIP_ARRAY3D_TEXTURE_GATHER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CUDA_VERSION"] = {"HIP_VERSION", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7050 // Types // NOTE: CUdevice might be changed to typedef int in the future. cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) - cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdeviceptr"] = {"hipDeviceptr_t", CONV_TYPE, API_DRIVER}; + // CUDA: "The types::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other." 
+ // typedef struct cudaArray *cudaArray_t; + // typedef struct CUarray_st *CUarray; + cuda2hipRename["CUarray_st"] = {"hipArray", CONV_MEM, API_RUNTIME}; // API_Runtime ANALOGUE (cudaArray) + cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) + // unsupported yet by HIP cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) @@ -554,8 +593,8 @@ struct cuda2hipMap { // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t. // TODO: Do for Pointer Attributes the same as for Device Attributes. - // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"] = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"] = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"] = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (no) @@ -565,13 +604,17 
@@ struct cuda2hipMap { cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"] = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"] = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (no) - // pointer to CUfunc_st cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; - // TODO: in HIP ihipModuleSymbol_t should be declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h - // ToDO: examples are needed with CUfunc_st + // TODO: move "typedef struct ihipModuleSymbol_t *hipFunction_t;" from hcc_details to HIP + // typedef struct CUfunc_st *CUfunction; // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; + // typedef struct CUgraphicsResource_st *CUgraphicsResource; + cuda2hipRename["CUgraphicsResource"] = {"hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUmipmappedArray_st *CUmipmappedArray; + cuda2hipRename["CUmipmappedArray"] = {"hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -607,8 +650,6 @@ struct cuda2hipMap { cuda2hipRename["CU_OCCUPANCY_DEFAULT"] = {"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0) cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1) - - cuda2hipRename["CUfunc_cache_enum"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) 
cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; // 0x00 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) @@ -752,24 +793,41 @@ struct cuda2hipMap { cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21) cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22) - - - cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUctx_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipCtx_t *hipCtx_t;" from hcc_details to HIP + // typedef struct CUctx_st *CUcontext; + // cuda2hipRename["CUctx_st"] = {"ihipCtx_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUmodule"] = {"hipModule_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUmod_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipModule_t *hipModule_t;" from hcc_details to HIP + // typedef struct CUmod_st *CUmodule; + // 
cuda2hipRename["CUmod_st"] = {"ihipModule_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUstream_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - // Stream Flags + // TODO: move "typedef struct ihipStream_t *hipStream_t;" from hcc_details to HIP + // typedef struct CUstream_st *CUstream; + // cuda2hipRename["CUstream_st"] = {"ihipStream_t", CONV_TYPE, API_DRIVER}; + + // typedef void (*hipStreamCallback_t) (hipStream_t stream, hipError_t status, void* userData); + // typedef void (CUDA_CB *CUstreamCallback) (CUstream hStream, CUresult status, void* userData) + cuda2hipRename["CUstreamCallback"] = {"hipStreamCallback_t", CONV_TYPE, API_DRIVER}; + + cuda2hipRename["CUsurfObject"] = {"hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUsurfref_st *CUsurfref; + cuda2hipRename["CUsurfref"] = {"hipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUsurfref_st"] = {"ihipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUtexObject"] = {"hipTextureObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUtexref_st *CUtexref; + cuda2hipRename["CUtexref"] = {"hipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUtexref_st"] = {"ihipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + // Stream Flags enum + cuda2hipRename["CUstream_flags"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; + // cuda2hipRename["CUstream_flags_enum"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; @@ -1254,10 +1312,10 @@ struct cuda2hipMap { // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) - 
cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) + cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE = 1) + cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_PROHIBITED = 2) + cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3) // Device Flags // unsupported yet by HIP From 126989760608eeac800a3942c672070dbf59fed9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 22:45:56 +0300 Subject: [PATCH 064/171] [HIPIFY] Blas update: add a few functions, supported by HIP. 
cublasDaxpy -> hipblasDaxpy cublasDgemv -> hipblasDgemv cublasDger -> hipblasDger cublasDgemm -> hipblasDgemm cublasDgemmBatched -> hipblasDgemmBatched cublasGetStream -> hipblasGetStream cublasSetStream -> hipblasSetStream cublasDaxpy -> hipblasDaxpy --- hipify-clang/src/Cuda2Hip.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index b3b2b993e3..3b34f0c1c1 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -1601,7 +1601,7 @@ struct cuda2hipMap { // Blas types cuda2hipRename["cublasHandle_t"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // TODO: dereferencing: typedef struct cublasContext *cublasHandle_t; - cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; + // cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // Blas management functions // unsupported yet by hipblas/hcblas cuda2hipRename["cublasInit"] = {"hipblasInit", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1709,10 +1709,9 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; - // there is no such a function in CUDA cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1789,8 +1788,8 @@ struct cuda2hipMap { cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS}; + 
cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1868,8 +1867,8 @@ struct cuda2hipMap { // GER cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1906,8 +1905,7 @@ struct cuda2hipMap { // Blas3 (v1) Routines // GEMM cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ -1915,8 +1913,7 @@ struct cuda2hipMap { // BATCH GEMM cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ 
-2074,10 +2071,9 @@ struct cuda2hipMap { cuda2hipRename["cublasCreate_v2"] = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas cuda2hipRename["cublasGetVersion_v2"] = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -2294,7 +2290,7 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; From 9b65358c682b8b1026c70d1e40035b68d0039073 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 23:05:44 +0300 Subject: [PATCH 065/171] [HIPIFY] CUDA RT memcpy functions update. 
cudaMemcpyFromSymbol -> hipMemcpyFromSymbol cudaMemcpyFromSymbolAsync -> hipMemcpyFromSymbolAsync cudaMemcpy2DAsync -> hipMemcpy2DAsync --- hipify-clang/src/Cuda2Hip.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 3b34f0c1c1..07fc817b53 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -1030,10 +1030,10 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyToSymbolAsync"] = {"hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyAsync"] = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2D"] = {"hipMemcpy2D", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2DToArray"] = {"hipMemcpy2DToArray", CONV_MEM, API_RUNTIME}; // unsupported yet by HIP cuda2hipRename["cudaMemcpy2DArrayToArray"] = {"hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArray"] = {"hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArrayAsync"] = {"hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DToArrayAsync"] = {"hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; @@ -1043,7 +1043,8 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyArrayToArray"] = {"hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemcpyFromSymbol"] = 
{"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; From 1cb51d614e730f56267e14e30951c5f7781b16e0 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 4 May 2017 06:47:55 +0530 Subject: [PATCH 066/171] hipMemcpy2DAsync for HIP/NVCC Change-Id: I46f0057fef49bdaaac41c1df80c3e27432b8f376 --- include/hip/nvcc_detail/hip_runtime_api.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 4feefcc342..aad3ffcc44 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -345,7 +345,11 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolN } inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ - return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); + return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind),stream)); } inline static hipError_t hipMemcpy2DToArray(hipArray *dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ From 05be936fd6ee2fa45fbab68cc70765782e7af8fe Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 5 May 2017 21:28:02 +0300 Subject: [PATCH 067/171] [HIPIFY] LLVM 3.9 support 3.8 and 3.9 are both 
supported. 3.8 is stable, 3.9 needs more testing. --- hipify-clang/CMakeLists.txt | 9 +++++++-- hipify-clang/src/Cuda2Hip.cpp | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index a02b91407f..872db3defe 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -6,8 +6,12 @@ set(BUILD_HIPIFY_CLANG 0 PARENT_SCOPE) # Find LLVM package find_package(LLVM 3.8 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) if (NOT ${LLVM_FOUND}) - message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM (v3.8) package using -DHIPIFY_CLANG_LLVM_DIR") -else() + find_package(LLVM 3.9 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) + if (NOT ${LLVM_FOUND}) + message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM 3.8 or LLVM 3.9 package using -DHIPIFY_CLANG_LLVM_DIR") + endif() +endif() +if (${LLVM_FOUND}) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) include(AddLLVM) @@ -31,6 +35,7 @@ else() clangSerialization clangSema clangEdit + clangFormat clangLex clangAnalysis clangDriver diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 07fc817b53..1c6406b4ab 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -3533,7 +3533,11 @@ void printAllStats(const std::string &csvFile, int64_t totalFiles, int64_t conve int main(int argc, const char **argv) { auto start = std::chrono::steady_clock::now(); auto begin = start; +#if (LLVM_VERSION_MAJOR >= 3) && (LLVM_VERSION_MINOR >= 9) + llvm::sys::PrintStackTraceOnErrorSignal(StringRef()); +#else llvm::sys::PrintStackTraceOnErrorSignal(); +#endif CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore); std::vector fileSources = OptionsParser.getSourcePathList(); std::string dst = OutputFilename; From 2a253680da5e342f484689b672ead3ba966a6272 Mon Sep 17 00:00:00 2001 From: 
wsttiger Date: Wed, 3 May 2017 14:21:08 +0000 Subject: [PATCH 068/171] Improve hipStreamWaitEvent test. - use addOne kernel, use local initializer rather than init_array. - use addOneReverse test to add from back of array. Test alternate fwd and backward to stress dependency logic. - check device-side dependencies. --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 637275c381..63c42da557 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -28,7 +28,41 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include +#include unsigned p_streams = 6; +unsigned p_db = 0; + + +template +__global__ void +addOne( const T *A_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +__global__ void +addOneReverse( const T *A_d, + T *C_d, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + (T)1; + //C_d[i] = (T)1; + } +} //------ @@ -36,49 +70,90 @@ unsigned p_streams = 6; template class Streamer { public: - Streamer(size_t numElements); + Streamer(T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsync(); + void runAsyncAfter(Streamer *depStreamer); + void runAsyncWaitSameStream(); void queryUntilComplete(); + void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + + hipEvent_t event() { return _event; }; + + T *C_d() { return _C_d; }; + private: - T *_A_h; - T *_B_h; T *_C_h; T *_A_d; - T *_B_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; size_t _numElements; + bool _reverse; }; 
+ template -Streamer::Streamer(size_t numElements) : - _numElements(numElements) +Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : + _A_d(A_d), + _numElements(numElements), + _reverse(reverse) { - HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + size_t sizeElements = numElements * sizeof(int); + + HIPCHECK(hipMalloc(&_C_d, sizeElements)); + HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); + + HIPCHECK(hipMemset(_C_d, -1, sizeElements)); + HIPCHECK(hipMemset(_C_h, -2, sizeElements)); HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); }; + template -void Streamer::runAsync() +void Streamer::runAsyncAfter(Streamer *depStreamer) +{ + if (p_db) { + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + } + + if (depStreamer) { + HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); + } + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } + HIPCHECK(hipEventRecord(_event, _stream)); +} + + +template +void Streamer::runAsyncWaitSameStream() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } // Test case where hipStreamWaitEvent 
waits on same event we just placed into the queue. HIPCHECK(hipEventRecord(_event, _stream)); HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); } + template void Streamer::queryUntilComplete() { @@ -89,10 +164,26 @@ void Streamer::queryUntilComplete() e = hipStreamQuery(_stream); } while (e != hipSuccess) ; - printf ("completed after %d queries\n", numQueries); + printf ("info: hipStreamQuery completed after %d queries\n", numQueries); }; +template +void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +{ + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); + HIPCHECK(hipStreamSynchronize(_stream)); + + T expected = initValue + expectedOffset; + + for (size_t i=0; i<_numElements; i++) { + if (_C_h[i] != expected) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } +} + + //--- //Parse arguments specific to this test. @@ -122,39 +213,68 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; + size_t sizeElements = numElements * sizeof(int); + + assert (sizeElements <= std::numeric_limits::max()); + + + int initValue = 1000; + + int * initArray_d, *initArray_h; + HIPCHECK(hipMalloc(&initArray_d, sizeElements)); + HIPCHECK(hipHostMalloc(&initArray_h, sizeElements)); + for (size_t i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); streamers.push_back(s); } if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsnc\n"); + printf ("==> Test 0x1 runAsyncAfter\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? 
streamers[i-1] : NULL); } HIPCHECK(hipDeviceSynchronize()); + + for (int i=0; isyncAndCheck(i+1, initValue, i+1); + } } if (p_tests & 0x2) { printf ("==> Test 0x2 queryUntilComplete\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL); streamers[i]->queryUntilComplete(); } HIPCHECK(hipDeviceSynchronize()); } if (p_tests & 0x4) { + printf ("==> Test 0x4 try null stream"); hipStreamQuery(0/* try null stream*/); } + if (p_tests & 0x8) { + printf ("==> Test 0x8 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncWaitSameStream(); + } + HIPCHECK(hipDeviceSynchronize()); + } + passed(); } From 2d0f509de52096e49afe476abadcfce2cded1244 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:19:37 -0500 Subject: [PATCH 069/171] Update streamEventTEst. - add checks for events across devices. - refactor test to make sure it runs long enough to sensitive sync techniques. - add tests for DeviceSync, streamWaitEvent. --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 321 ++++++++++++++---- 1 file changed, 250 insertions(+), 71 deletions(-) diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 63c42da557..1d9ec45685 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -29,39 +29,47 @@ THE SOFTWARE. 
#include "test_common.h" #include #include -unsigned p_streams = 6; +unsigned p_streams = 8; unsigned p_db = 0; +unsigned p_count = 100; + template __global__ void -addOne( const T *A_d, +addCount( const T *A_d, T *C_d, - size_t NELEM) + size_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (size_t i=offset; i __global__ void -addOneReverse( const T *A_d, +addCountReverse( const T *A_d, T *C_d, - int64_t NELEM) + int64_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { - C_d[i] = A_d[i] + (T)1; - //C_d[i] = (T)1; - } + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } } @@ -70,41 +78,65 @@ addOneReverse( const T *A_d, template class Streamer { public: - Streamer(T *input, size_t numElements, bool reverse); + Streamer(int deviceId, T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsyncAfter(Streamer *depStreamer); + void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); void queryUntilComplete(); - void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + size_t check(int streamerNum, T initValue, T expectedOffset, bool expectPass=true); + void copyToHost(hipStream_t copyStream); hipEvent_t event() { return _event; }; + int deviceId() const { return _deviceId; }; + size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; private: + T *_C_h; + T *_preA_d; // if input is on another device, this is pointer to that memory. 
T *_A_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; + int _deviceId; size_t _numElements; bool _reverse; + + size_t _mismatchCount; }; template -Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : + _preA_d(NULL), _A_d(A_d), + _deviceId(deviceId), _numElements(numElements), _reverse(reverse) { size_t sizeElements = numElements * sizeof(int); + HIPCHECK(hipSetDevice(_deviceId)); + + + hipPointerAttribute_t attr; + HIPCHECK(hipPointerGetAttributes(&attr, A_d)); + if (attr.device != deviceId) { + // source is on another device, we will need to copy later. + // So save original source pointer and allocate local space. + printf ("info: source for streamer on another device, will insert memcpy\n"); + _preA_d = A_d; + HIPCHECK(hipMalloc(&_A_d, sizeElements)); + HIPCHECK(hipMemset(_A_d, -3, sizeElements)); + } + HIPCHECK(hipMalloc(&_C_d, sizeElements)); HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); @@ -113,12 +145,16 @@ Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); + + + }; template -void Streamer::runAsyncAfter(Streamer *depStreamer) +void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { + HIPCHECK(hipSetDevice(_deviceId)); if (p_db) { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); } @@ -127,36 +163,31 @@ void Streamer::runAsyncAfter(Streamer *depStreamer) HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); } - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } - HIPCHECK(hipEventRecord(_event, 
_stream)); -} - - -template -void Streamer::runAsyncWaitSameStream() -{ - printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + if (_preA_d) { + // _preA_d is on another device, so copy to local device so kernel can access it: + HIPCHECK(hipMemcpyAsync(_A_d, _preA_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } - // Test case where hipStreamWaitEvent waits on same event we just placed into the queue. + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else { + hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } HIPCHECK(hipEventRecord(_event, _stream)); - HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); + + if (waitSameStream) { + HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); // this is essentially a no-op, but make sure it doesn't crash + } } + template void Streamer::queryUntilComplete() { + HIPCHECK(hipSetDevice(_deviceId)); int numQueries = 0; hipError_t e = hipSuccess; do { @@ -168,19 +199,48 @@ void Streamer::queryUntilComplete() }; +// If copyStream is !nullptr it is used for the copy. 
template -void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +void Streamer::copyToHost(hipStream_t copyStream) { - HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); - HIPCHECK(hipStreamSynchronize(_stream)); + if (p_db) { + printf ("db: copy back to host\n"); + } + HIPCHECK(hipSetDevice(_deviceId)); + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, copyStream ? copyStream : _stream)); + HIPCHECK(hipStreamSynchronize(copyStream ? copyStream:_stream)); +} + + +template +size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool expectPass) +{ T expected = initValue + expectedOffset; + if (p_db) { + printf ("db: check\n"); + } + _mismatchCount = 0; for (size_t i=0; i<_numElements; i++) { if (_C_h[i] != expected) { - failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + _mismatchCount++; + if (expectPass) { + fprintf(stderr, "for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + if (_mismatchCount > 10) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } } } + + if (!expectPass && (_mismatchCount ==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + //failed("for streamer:%d we expected inavalid synchronization to lead to mismatch but none was detected. Increase --N to sensitize sync.\n", streamerNum); + + } + + return _mismatchCount; } @@ -189,6 +249,8 @@ void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) //Parse arguments specific to this test. 
void parseMyArguments(int argc, char *argv[]) { + N = 64*1024*1024; + int more_argc = HipTest::parseStandardArguments(argc, argv, false); // parse args for this test: @@ -199,6 +261,14 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--count")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_count)) { + failed("Bad count argument"); + } + } else if (!strcmp(arg, "--db")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_db)) { + failed("Bad db argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -206,6 +276,91 @@ void parseMyArguments(int argc, char *argv[]) }; +typedef Streamer IntStreamer; + + + + +void runStreamerLoop(std::vector &streamers) +{ + for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); + } +} + + +void checkAll(int initValue, std::vector &streamers, std::vector &sideStreams, bool expectPass=true) +{ + size_t mismatchCount=0; + + // Copy in reverse order to catch anything not yet finished... + for (int i=streamers.size()-1; i>=0; i--) { + streamers[i]->copyToHost(sideStreams.empty() ? NULL : sideStreams[streamers[i]->deviceId()]); + } + + + // Check in forward order so we can find first mismatch: + for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + } + if (!expectPass && (mismatchCount==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + failed("we expected inavalid synchronization to lead to mismatch but none was detected. 
Increase --count to sensitize sync.\n"); + } + +} + + + +#define RUN_SYNC_TEST(_enableBit, _streamers, _sync, _expectPass)\ + if (p_tests & (_enableBit)) {\ + printf ("==> Test %02x runAsyncAfter sync=%s\n", (_enableBit), #_sync);\ + runStreamerLoop(_streamers);\ + (_sync);\ + checkAll (initValue, _streamers, sideStreams, _expectPass);\ + } + + + + +//--- +// A family of sync functions which somehow wait for inflight activity to finish: + + +void sync_none(void) {}; + +void sync_allDevices(int numDevices) +{ + for (int d=0; d streamers) +{ + for (int i=0; iqueryUntilComplete(); + }; +} + + +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +{ + HIPCHECK(hipSetDevice(sideDeviceId)); + + // wait on the last event in the stream of chained streamers: + // This plants a marker which the subsquent copy for this device will wait on: + HIPCHECK(hipStreamWaitEvent(sideStream, lastEvent, 0)); + + if (waitHere) { + HIPCHECK(hipStreamSynchronize(sideStream)); + } +} + + //--- int main(int argc, char *argv[]) @@ -213,13 +368,17 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer IntStreamer; + std::vector streamers; + std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); + printf("info: sizeof arrays = %zu elements (%6.3f MB)\n", numElements, sizeElements / 1024.0/1024.0); + printf("info: streams=%d count=%d\n", p_streams, p_count); + assert (sizeElements <= std::numeric_limits::max()); @@ -234,45 +393,65 @@ int main(int argc, char *argv[]) HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + numDevices =2; // TODO - remove me. 
- for (int i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); - streamers.push_back(s); - } - - if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsyncAfter\n"); + for (int d=0; drunAsyncAfter(i ? streamers[i-1] : NULL); - } - HIPCHECK(hipDeviceSynchronize()); - - for (int i=0; isyncAndCheck(i+1, initValue, i+1); + IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, i&1 /*reverse?*/); + streamers.push_back(s); + if (d==0) { + streamersDev0.push_back(s); + } } } - if (p_tests & 0x2) { - printf ("==> Test 0x2 queryUntilComplete\n"); - for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); - streamers[i]->queryUntilComplete(); - } - HIPCHECK(hipDeviceSynchronize()); + // A sideband stream channel that is independent from above. + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // asynchronous wrt the other streams. + std::vector sideStreams; + for (int d=0; d Test 0x4 try null stream"); + + // Tests on first GPU: + RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); + RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + + if (numDevices > 1) { + // Sync on second device for activity running on device 0: + RUN_SYNC_TEST(0x10, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 1, sideStreams[1], true), true); + } + + + // Tests on all GPUs: + // RUN_SYNC_TEST(0x100, streamers, sync_streamWaitEvent(streamers.back()->event(), 0, sideStreams[0], false), true); + + + + + if (p_tests & 0x1000) { + printf ("==> Test 0x1000 try null stream\n"); hipStreamQuery(0/* try null stream*/); } - if (p_tests & 0x8) { - printf ("==> Test 0x8 runAsyncWaitSameStream\n"); - for (int i=0; irunAsyncWaitSameStream(); + + // 
Insert small wrinkle here, insert a wait on event just recorded, all in the same stream. + if (p_tests & 0x2000) { + printf ("==> Test 0x2000 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncAfter(i ? streamersDev0[i-1] : NULL, true/*waitSameStream*/); } - HIPCHECK(hipDeviceSynchronize()); + + sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false); + checkAll (initValue, streamersDev0, sideStreams); } From 6437f5d2b20f8c6e204cc35921dc1329a3d9dfb2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:20:56 -0500 Subject: [PATCH 070/171] Refactor hipHostRegister test. Run all tests in one command. Run 128 offsets. --- tests/src/runtimeApi/memory/hipHostRegister.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipHostRegister.cpp b/tests/src/runtimeApi/memory/hipHostRegister.cpp index 8cf0979261..3376ee04f1 100644 --- a/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,9 +19,7 @@ THE SOFTWARE. /* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t --tests 0x1 - * RUN: %t --tests 0x2 - * RUN: %t --tests 0x4 + * RUN: %t * HIT_END */ @@ -131,7 +129,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipMalloc(&Bd, size)); // TODO - set to 128 -#define OFFSETS_TO_TRY 1 +#define OFFSETS_TO_TRY 128 assert (N>OFFSETS_TO_TRY); if (p_tests & 0x2) { From 687809104b93c89904146cb174af971d9a301c7d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:28:11 -0500 Subject: [PATCH 071/171] Fix some typos, add additional guidance for -BSymbolic --- docs/markdown/hip_bugs.md | 2 ++ src/hip_memory.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index abb31d80e8..91b2a5a019 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -45,6 +45,8 @@ To correct, add the following flag to hcc or hipcc: $ hipcc -Wl,-Bsymbolic ... 
``` +Ensure there is no space in the "Wl,-Bsymbolic" option. + ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index c4bc7db096..1ba698f461 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -104,8 +104,8 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig auto device = ctx->getWriteableDevice(); ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); - tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n", - msg, ptr, sizeBytes, device->_deviceId); + tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n", + msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { int r = sharePtr(ptr, ctx, hipFlags); From a38e36ec2f3905a3701619b6e72c23a39c9709c1 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 9 May 2017 10:14:16 -0500 Subject: [PATCH 072/171] added guard against hip_runtime.h so that non-hcc compilers can use it Change-Id: I3d68deda9ce8a5956e21e15a69e549d6c21e3e39 --- include/hip/hcc_detail/hip_runtime.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 06ce65bc9a..4d8876d8f4 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -41,6 +41,8 @@ THE SOFTWARE. #include #endif//__cplusplus +#if __HCC__ + // Define NVCC_COMPAT for CUDA compatibility #define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess @@ -481,6 +483,6 @@ do {\ */ - +#endif #endif//HIP_HCC_DETAIL_RUNTIME_H From e0c3ea15b294ba8d6f1404b32a443489db38d6ca Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 May 2017 13:23:49 -0500 Subject: [PATCH 073/171] Fix hipStreamWaitEvent for single GPU. 
--- tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 1d9ec45685..adf0d4af0c 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -395,7 +395,7 @@ int main(int argc, char *argv[]) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - numDevices =2; // TODO - remove me. + numDevices = min(2, numDevices); // multi-GPU to 2 device. for (int d=0; d Date: Wed, 10 May 2017 17:32:25 -0500 Subject: [PATCH 074/171] hipHostMalloc allocation are mapped to all devices by default. Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory. --- src/hip_hcc.cpp | 9 ++ src/hip_hcc_internal.h | 1 + src/hip_memory.cpp | 58 ++++---- tests/src/hipPointerAttrib.cpp | 2 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 129 +++++++++++++----- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 40 +----- tests/src/test_common.h | 38 ++++++ 7 files changed, 181 insertions(+), 96 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 71d947488d..81a2079b5b 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -118,6 +118,7 @@ bool g_visible_device = false; unsigned g_deviceCnt; std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; +hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. 
unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); @@ -1389,6 +1390,14 @@ void ihipInit() g_deviceCnt++; } } + + g_allAgents = static_cast (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t))); + g_allAgents[0] = g_cpu_agent; + for (int i=0; i_hsaAgent; + } + + g_numLogicalThreads = std::thread::hardware_concurrency(); // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 9c17c6e98c..132f099ce8 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -826,6 +826,7 @@ private: // Critical data, protected with locked access: extern std::once_flag hip_initialized; extern unsigned g_deviceCnt; extern hsa_agent_t g_cpu_agent ; // the CPU agent. +extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. //================================================================================================= // Extern functions: diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 1ba698f461..c4f0f64e50 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK } // return 0 on success or -1 on error: -int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) { int ret = 0; auto device = ctx->getWriteableDevice(); hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); - if (peerCnt > 1) { - //printf ("peer self access\n"); + if (shareWithAll) { + hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + if (s != 
HSA_STATUS_SUCCESS) { + ret = -1; + } + } else { + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; + //printf ("peer self access\n"); - hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } } } } @@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) // Allocate a new pointer with am_alloc and share with all valid peers. 
// Returns null-ptr if a memory error occurs (either allocation or sharing) -void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags) { void *ptr = nullptr; @@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { - int r = sharePtr(ptr, ctx, hipFlags); + int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); if (r != 0) { ptr = nullptr; } @@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } else { auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/); if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { - trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; + // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; } const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; @@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto device = ctx->getWriteableDevice(); unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? 
"finegrained_host":"pinned_host", - sizeBytes, ctx, amFlags, flags); + sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; } @@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, hip_status = hipErrorUnknown; break; } - *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } diff --git a/tests/src/hipPointerAttrib.cpp b/tests/src/hipPointerAttrib.cpp index fb7832d9a6..7a2ab64bea 100644 --- a/tests/src/hipPointerAttrib.cpp +++ b/tests/src/hipPointerAttrib.cpp @@ -99,7 +99,7 @@ inline int zrand(int max) //================================================================================================= -// Functins to run tests +// Functions to run tests //================================================================================================= //-- //Run through a couple simple cases to test lookups and host pointer arithmetic: diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 0a256d6362..1ee5cbc9bb 100644 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -25,45 +25,106 @@ THE SOFTWARE. 
#include"test_common.h" -#define SIZE 1024*1024*256 +#define NUM_ELEMENTS 1024*1024*64 +#define SIZE NUM_ELEMENTS*sizeof(int) -int main(){ - float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0; - B = (float*)malloc(SIZE); - hipMalloc((void**)&Ad, SIZE); - hipHostMalloc((void**)&B, SIZE); - hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault); - hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped); - hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped); - - hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/); - - HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess); - - HIPCHECK_API(hipFree(Ad) , hipSuccess); - HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue); - - HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory - HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(ptr_0) , hipSuccess); - HIPCHECK_API(hipHostFree(Bd) , hipSuccess); - HIPCHECK_API(hipHostFree(Bm) , hipSuccess); - - HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipHostFree(C) , hipSuccess); +int p_count = 4; - HIPCHECK_API(hipFree(NULL) , hipSuccess); - HIPCHECK_API(hipHostFree(NULL) , hipSuccess); +void multiGpuHostAlloc(int allocDevice) +{ + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + assert(numDevices > 1); + + printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); + + + HIPCHECK(hipSetDevice(allocDevice)); + + int *Ah, *Ch; + hipHostMalloc((void**)&Ah, SIZE); + hipHostMalloc((void**)&Ch, SIZE); + + const int init = -1; + for (size_t i=0; i 1); + + multiGpuHostAlloc(0); + multiGpuHostAlloc(1); } passed(); diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index adf0d4af0c..80ff7ad98d 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -35,42 
+35,6 @@ unsigned p_count = 100; -template -__global__ void -addCount( const T *A_d, - T *C_d, - size_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i -__global__ void -addCountReverse( const T *A_d, - T *C_d, - int64_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i=0; i-=stride) { - C_d[i] = A_d[i] + (T)count; - } - } -} //------ @@ -171,9 +135,9 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_reverse) { - hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } else { - hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } HIPCHECK(hipEventRecord(_event, _stream)); diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 1250de4801..633ee6f825 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +addCount( const T *A_d, + T *C_d, + size_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i +__global__ void 
+addCountReverse( const T *A_d, + T *C_d, + int64_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + template void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) From 25d470c3801a82054461972d7b2f478115920647 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 11 May 2017 21:50:36 +0300 Subject: [PATCH 075/171] [HIPIFY] Fix string routines. Some Clang tooling functions return std::string, some return StringRef. Assigning of returning std::string to StringRef variables leads to garbage in it. DEBUG build is always affected. --- hipify-clang/src/Cuda2Hip.cpp | 51 +++++++++++++++++------------------ 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 1c6406b4ab..47434babac 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2690,7 +2690,7 @@ private: bool cudaCall(const MatchFinder::MatchResult &Result) { if (const CallExpr *call = Result.Nodes.getNodeAs("cudaCall")) { const FunctionDecl *funcDcl = call->getDirectCallee(); - StringRef name = funcDcl->getDeclName().getAsString(); + std::string name = funcDcl->getDeclName().getAsString(); SourceManager *SM = Result.SourceManager; SourceLocation sl = call->getLocStart(); const auto found = N.cuda2hipRename.find(name); @@ -2714,16 +2714,16 @@ private: } } if (bReplace) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); Replacement Rep(*SM, sl, length, repName); FullSourceLoc fullSL(sl, *SM); insertReplacement(Rep, fullSL); } } else { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); } } else { - std::string msg = "the following reference is not handled: '" + 
name.str() + "' [function call]."; + std::string msg = "the following reference is not handled: '" + name + "' [function call]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2838,7 +2838,7 @@ private: bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) { if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs("cudaEnumConstantRef")) { - StringRef name = enumConstantRef->getDecl()->getNameAsString(); + StringRef name = enumConstantRef->getDecl()->getName(); SourceLocation sl = enumConstantRef->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); @@ -2861,10 +2861,10 @@ private: bool cudaEnumDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { - StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + std::string name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); QualType QT = enumDecl->getType().getUnqualifiedType(); - StringRef name_unqualified = QT.getAsString(); - if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { name = name_unqualified; } // Workaround for enum VarDecl as param decl, declared with enum type specifier @@ -2882,7 +2882,7 @@ private: //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2890,7 +2890,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant decl]."; + 
std::string msg = "the following reference is not handled: '" + name + "' [enum constant decl]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2905,12 +2905,12 @@ private: QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType(); } QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); SourceLocation sl = typedefVar->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2918,7 +2918,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2935,10 +2935,10 @@ private: SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); QualType QT = t->getPointeeType(); QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2947,7 +2947,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var ptr]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var ptr]."; printHipifyMessage(*SM, sl, msg); } } @@ -2961,13 +2961,13 @@ private: QualType QT = structVar->getType(); // ToDo: find case-studies with types other than Struct. 
if (QT->isStructureType()) { - StringRef name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); + std::string name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2976,7 +2976,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [struct var]."; + std::string msg = "the following reference is not handled: '" + name + "' [struct var]."; printHipifyMessage(*SM, sl, msg); } } @@ -3049,7 +3049,7 @@ private: // Example: extern __shared__ uint sRadix1[]; if (sharedVar->hasExternalFormalLinkage()) { QualType QT = sharedVar->getType(); - StringRef typeName; + std::string typeName; if (QT->isIncompleteArrayType()) { const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); @@ -3071,9 +3071,8 @@ private: SourceLocation slEnd = sharedVar->getLocEnd(); SourceManager *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; - SmallString<128> tmpData; - StringRef varName = sharedVar->getNameAsString(); - StringRef repName = Twine("HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")").toStringRef(tmpData); + std::string varName = sharedVar->getNameAsString(); + std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")"; Replacement Rep(*SM, slStart, repLength, repName); FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); @@ -3089,7 +3088,7 @@ private: bool cudaParamDecl(const MatchFinder::MatchResult &Result) { 
if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs("cudaParamDecl")) { QualType QT = paramDecl->getOriginalType().getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const Type *t = QT.getTypePtr(); if (t->isStructureOrClassType()) { name = t->getAsCXXRecordDecl()->getName(); @@ -3099,7 +3098,7 @@ private: SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -3107,7 +3106,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [param decl]."; + std::string msg = "the following reference is not handled: '" + name + "' [param decl]."; printHipifyMessage(*SM, sl, msg); } return true; From b306095ac28d7e399487b5d72df7ed7fcd340150 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 15:57:32 -0500 Subject: [PATCH 076/171] Add hipEventDisableSystemRelease flag. --- include/hip/hcc_detail/hip_runtime_api.h | 1 + include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ src/hip_hcc.cpp | 10 +++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 9cfd21c1d2..175fd64d29 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -106,6 +106,7 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. #define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. 
+#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. //! Flags that can be used with hipHostMalloc diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index aad3ffcc44..e9f926b336 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,6 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess +#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ + #define hipHostMallocDefault cudaHostAllocDefault #define hipHostMallocPortable cudaHostAllocPortable diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 81a2079b5b..e936cf3af3 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -56,6 +56,9 @@ THE SOFTWARE. #define USE_ROCR_1_4 1 #endif +// needs HCC change for hc::no_scope +#define USE_NO_SCOPE 0 + //================================================================================================= //Global variables: //================================================================================================= @@ -364,8 +367,13 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); this->ensureHaveQueue(crit); +#if USE_NO_SCOPE + printf ("create_marker, flags = %x\n", event->_flags); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? 
hc::no_scope : hc::system_scope); +#else event->_marker = crit->_av.create_marker(); -} +#endif +}; //============================================================================= From 0679831384072c4f9ddeff128f0c284c31887de8 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 16:05:28 -0500 Subject: [PATCH 077/171] Remove old USE_ switches no longer needed. --- src/hip_hcc.cpp | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index e936cf3af3..12f1792c33 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -48,14 +48,6 @@ THE SOFTWARE. #include "env.h" -#ifndef USE_COPY_EXT_V2 -#define USE_COPY_EXT_V2 1 -#endif - -#ifndef USE_ROCR_1_4 -#define USE_ROCR_1_4 1 -#endif - // needs HCC change for hc::no_scope #define USE_NO_SCOPE 0 @@ -107,10 +99,6 @@ int HCC_OPT_FLUSH = 0; -#define HIP_USE_PRODUCT_NAME 1 -//#define DISABLE_COPY_EXT 1 - - std::once_flag hip_initialized; // Array of pointers to devices. @@ -857,11 +845,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) // Get Max Threads Per Multiprocessor uint32_t max_waves_per_cu; -#if USE_ROCR_1_4 err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu); -#else - max_waves_per_cu = 10; -#endif DeviceErrorCheck(err); prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu; @@ -1919,11 +1903,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? 
&copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } @@ -2031,18 +2011,10 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { -#if USE_COPY_EXT_V2 crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc, forceUnpinnedCopy); -#else - crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } else { -#if USE_COPY_EXT_V2 crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc); -#else - crit->_av.copy_async(src, dst, sizeBytes); -#endif } } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); @@ -2075,11 +2047,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ?
&copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } } From a43149135e104cc03e582a68060f29374d080106 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 12 May 2017 21:43:34 -0500 Subject: [PATCH 078/171] added gfx900 to hipDeviceProp_t Change-Id: I49e7a32f218926fd55f1c94c5dc2366d6c8ac4ca --- src/hip_hcc.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 12f1792c33..a655e35aa1 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -772,6 +772,9 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) if(strcmp(archName,"gfx803")==0){ prop->gcnArch = 803; } + if(strcmp(archName,"gfx900")==0){ + prop->gcnArch = 900; + } DeviceErrorCheck(err); From a97cb6810cd6a1aa3af708f6726358ebecb2ed80 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 15 May 2017 15:35:52 +0300 Subject: [PATCH 079/171] [HIPIFY] CUDA Driver API: Primary Context Management support.
--- hipify-clang/src/Cuda2Hip.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 47434babac..72cbcaf52a 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -858,6 +858,13 @@ struct cuda2hipMap { cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + // Primary Context Management + cuda2hipRename["cuDevicePrimaryCtxGetState"] = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRelease"] = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxReset"] = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxSetFlags"] = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER}; + // Device cuda2hipRename["cuDeviceGet"] = {"hipGetDevice", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetName"] = {"hipDeviceGetName", CONV_DEV, API_DRIVER}; From 1223612331678dae749aec6a125efdf5ccd9e007 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 16 May 2017 07:15:13 +0530 Subject: [PATCH 080/171] Added hipMallocPitch on HIP/NVCC path Change-Id: Ie3ba7d3f95acac23805efa919531043b350a3f21 --- include/hip/nvcc_detail/hip_runtime_api.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index e9f926b336..69a9b46570 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -204,6 +204,10 @@ inline static hipError_t hipMalloc(void** ptr, size_t size) { return hipCUDAErrorTohipError(cudaMalloc(ptr, size)); } +inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t 
width, size_t height) { + return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height)); +} + inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } From 12d8c53c900b7c8c3ed49d1cef80618c59f6860e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 16 May 2017 18:21:25 +0300 Subject: [PATCH 081/171] [HIPIFY] cudaMallocPitch -> hipMallocPitch --- hipify-clang/src/Cuda2Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 72cbcaf52a..0c6b0f1efc 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -1088,7 +1088,7 @@ struct cuda2hipMap { cuda2hipRename["cudaMalloc3DArray"] = {"hipMalloc3DArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocManaged"] = {"hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocMipmappedArray"] = {"hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFree"] = {"hipFree", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFreeHost"] = {"hipHostFree", CONV_MEM, API_RUNTIME}; From 30000ef130cdd0d086812f0693af36af6c44fe75 Mon Sep 17 00:00:00 2001 From: emankov Date: Tue, 16 May 2017 19:52:39 +0300 Subject: [PATCH 082/171] [HIPIFY] *.inl extension support for batch processing --- bin/findcode.sh | 2 +- bin/hipexamine.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/findcode.sh b/bin/findcode.sh index a2334b3e2d..d092d6bf8d 100755 --- a/bin/findcode.sh +++ b/bin/findcode.sh @@ -2,4 +2,4 @@ SEARCH_DIRS=$@ -find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' +find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o 
-name '*.c' -o -name '*.hpp' -o -name '*.inl' diff --git a/bin/hipexamine.sh b/bin/hipexamine.sh index 2a6fab7110..79e1b469f9 100755 --- a/bin/hipexamine.sh +++ b/bin/hipexamine.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine2.sh DIRNAME [hipify options] [--] [clang options] +#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options] # Generate CUDA->HIP conversion statistics for all the code files in the specified directory. From 0edab14139c564781e85febb603957a09590f9b6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 27 Jan 2017 11:21:08 -0600 Subject: [PATCH 083/171] Add HIP_TRACE_API=4. Only display memory allocation/free apis. --- README.md | 1 + docs/markdown/hip_profiling.md | 5 ++++ src/hip_hcc.cpp | 2 +- src/hip_hcc_internal.h | 16 ++++++------ src/hip_memory.cpp | 46 +++++++++++++++++----------------- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index f61c3b106a..d54032c6df 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,7 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) +- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index 6e5cde700d..b5c4672464 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -267,6 +267,11 @@ info: check result PASSED! ``` +HIP_TRACE_API supports multiple levels of debug information: + - 0x1 = print all HIP APIs + - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets. 
Includes hipLaunchKernel, hipMemcpy*, hipMemset*. + - 0x4 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + #### Color Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index a655e35aa1..07604fe85d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -1435,7 +1435,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) { - if ((HIP_TRACE_API & (1< dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); @@ -826,7 +826,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, spitch, width, height, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -879,7 +879,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, count, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, count, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -938,7 +938,7 @@ ihipMemsetKernel(hipStream_t stream, // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. 
hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { - HIP_INIT_CMD_API(dst, value, sizeBytes, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes, stream); hipError_t e = hipSuccess; @@ -988,7 +988,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; @@ -1148,7 +1148,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) hipError_t hipFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); hipError_t hipStatus = hipErrorInvalidDevicePointer; @@ -1176,7 +1176,7 @@ hipError_t hipFree(void* ptr) hipError_t hipHostFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); // Synchronize to ensure all work has finished. ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. @@ -1210,7 +1210,7 @@ hipError_t hipFreeHost(void* ptr) hipError_t hipFreeArray(hipArray* array) { - HIP_INIT_API(array); + HIP_INIT_SPECIAL_API((TRACE_MEM), array); hipError_t hipStatus = hipErrorInvalidDevicePointer; From db097ab392a0aad6e3407b0d33d9d2d78638c28a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 14 Feb 2017 21:50:16 -0600 Subject: [PATCH 084/171] split debugging into separate .md file --- README.md | 3 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 200 ++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 tests/src/runtimeApi/stream/hipNullStream.cpp diff --git a/README.md b/README.md index d54032c6df..d04d63714f 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,8 @@ HIP releases are typically of two types. 
The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) -- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) +- [HIP Profiling ](docs/markdown/hip_profiling.md) +- [HIP Debugging](docs/markdown/hip_debugging.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp new file mode 100644 index 0000000000..f8d201cb51 --- /dev/null +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -0,0 +1,200 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "hip/hip_runtime.h" +#include "test_common.h" +#include +unsigned p_streams = 6; +int p_repeat = 10; + + +template +__global__ void +vectorADDRepeat(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM, + int repeat) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int j=1; j<=repeat;j++) { + for (size_t i=offset; i +class Streamer { +public: + Streamer(size_t numElements, bool useNullStream=false); + ~Streamer(); + void enqueAsync(); + void queryUntilComplete(); + + +public: + T *_A_h; + T *_B_h; + T *_C_h; + + T *_A_d; + T *_B_d; + T *_C_d; + + hipStream_t _stream; + hipEvent_t _event; + + size_t _numElements; +}; + +template +Streamer::Streamer(size_t numElements, bool useNullStream) : + _numElements(numElements) +{ + HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + + if (useNullStream) { + _stream = 0x0; + } else { + HIPCHECK(hipStreamCreate(&_stream)); + } + HIPCHECK(hipEventCreate(&_event)); +}; + +template +void Streamer::enqueAsync() +{ + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + +} + +template +void Streamer::queryUntilComplete() +{ + int numQueries = 0; + hipError_t e = hipSuccess; + do { + numQueries++; + e = hipStreamQuery(_stream); + } while (e != hipSuccess) ; + + printf ("completed after %d queries\n", numQueries); +}; + + + +//--- +//Parse arguments specific to this test. 
+void parseMyArguments(int argc, char *argv[]) +{ + int more_argc = HipTest::parseStandardArguments(argc, argv, false); + + // parse args for this test: + for (int i = 1; i < more_argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, "--streams")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + + +//--- +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, false); + parseMyArguments(argc, argv); + + typedef Streamer FloatStreamer; + + std::vector streamers; + + size_t numElements = N; + + float *expected_H = (float*)malloc(numElements*sizeof(float)); + + + auto nullStreamer = new FloatStreamer(numElements, true); + for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + } + + + for (int i=0; i Test 0x1 runAsnc\n"); + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[p_streams - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete. + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipStreamSynchronize(0)); + + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + } + + + if (p_tests & 0x2) { + printf ("==> Test 0x2 runAsnc-odd-only\n"); + for (int i=0; ienqueAsync(); + } + } + } + + + passed(); +} From 704ba30b32c55548797160ace632d70af7e631a9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:01:35 -0500 Subject: [PATCH 085/171] Doc update - split hip_debugging.md into separate file. 
--- docs/markdown/hip_debugging.md | 168 +++++++++++++++++++++++++++++++++ docs/markdown/hip_faq.md | 4 +- docs/markdown/hip_profiling.md | 160 +------------------------------ 3 files changed, 171 insertions(+), 161 deletions(-) create mode 100644 docs/markdown/hip_debugging.md diff --git a/docs/markdown/hip_debugging.md b/docs/markdown/hip_debugging.md new file mode 100644 index 0000000000..e7e058d17a --- /dev/null +++ b/docs/markdown/hip_debugging.md @@ -0,0 +1,168 @@ +Table of Contents +================= + + * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true">" in the printed output. +This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. +The gdb syntax also supports using the variable name (in this case 'dst'): +``` +(gdb) p dst +$33 = (void *) 0x5ec7e9000 +(gdb) call hc::am_memtracker_print(dst) +TargetAddress:0x5ec7e9000 + 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) +... +-->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) + +``` + +To debug an explicit address, cast the address to (void*) : +``` +(gdb) call hc::am_memtracker_print((void*)0x508c7f000) +``` +- Debugging GPUVM fault. +For example: +``` +Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. + +Program received signal SIGABRT, Aborted. +[Switching to Thread 0x7fffdffb5700 (LWP 14893)] +0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. 
+(gdb) bt +#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 +#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#4 0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 +#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 +(gdb) info threads + Id Target Id Frame + 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 +* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 + 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) thread 1 +[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) bt +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#1 0x00007ffff6f929ba in ?? 
() from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so +#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 +... +``` + +### General Debugging Tips +- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime, i.e. inside "`__GI_raise`" as shown in the example above. +- To determine the true location of the fault, force the kernels to execute synchronously by setting the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before returning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will be the thread which is waiting inside the libhsa-runtime64.so. +- VM faults inside kernels can be caused by: + - incorrect code (e.g. a for loop which extends past array boundaries), + - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). + - synchronization issues + - compiler issues (incorrect code generation from the compiler) + - runtime issues + +-- General debug tips: +- 'gdb --args' can be used to conveniently pass the executable and arguments to gdb.
+- From inside GDB, you can set environment variables with "set env". Note the command does not use an '=' sign: +``` +(gdb) set env HIP_DB 1 +``` + +#### Print env var state +Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. +Setting HCC_PRINT_ENV=1 and then running an HCC application will print the HCC environment variables, their current values, and usage info. diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index e316d449ef..07ec5f1d8b 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -53,7 +53,7 @@ At a high-level, the following features are not supported: - Dynamic parallelism (CUDA 5.0) - Managed memory (CUDA 6.5) - Graphics interoperability with OpenGL or Direct3D -- CUDA Driver API (Under Development) +- CUDA Driver API - CUDA IPC Functions (Under Development) - CUDA array, mipmappedArray and pitched memory - MemcpyToSymbol functions @@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD - Per-thread-streams (under development) - C++11 (HCC supports all of C++11, all of C++14 and some C++17 features) - CUDA 7.5 - - float16 (under development) + - float16 - CUDA 8.0 - TBD. diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index b5c4672464..ef349ef2a5 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -1,4 +1,4 @@ -# Profiling and Debugging HIP Code +# Profiling HIP Code This section describes the profiling and debugging capabilities that HIP provides. Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs.
@@ -280,161 +280,3 @@ None will disable use of color control codes for both the opening and closing an -### Using HIP_DB - -This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well. -The HIP debug information is designed to print important information during the execution of a HIP API. HIP provides -different color-coded levels of debug information: - - api : Print the beginning and end of each HIP API, including the arguments and return codes. This is equivalent to setting HIP_TRACE_API=1. - - sync : Print multi-thread and other synchronization debug information. - - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory. - - mem : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more. - -DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask. Generally the + format is preferred. -For example: -``` -$ HIP_DB=api+copy+mem my-application -$ HIP_DB=0xF my-application -``` - -### Using ltrace -ltrace is a standard linux tool which provides a message to stderr on every dynamic library call. Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries. Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger. -The trace can also show performance issues related to accidental calls to expensive API calls on the critical path. - -ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack. Here's a sample command-line and output: - -``` -$ HIP_DB=api ltrace -C -e 'hsa*' - -... 
- -<hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 -libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0 -libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0 -<... hsa_amd_memory_lock resumed> ) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0 -libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0 -libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0 -libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 -libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0 -libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0 -<... hsa_amd_memory_unlock resumed> ) = 0 - hip-api tid:1.17 hipMemcpy ret= 0 (hipSuccess)>> -``` - -Some key information from the trace above. - - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output. The messages show the thread id and API sequence number (`1.17`). ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing. - - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls. - - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so". - - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy. The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output. 
- - -### Chicken bits -Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization. -These are not intended for production but can be useful diagnose synchronization problems in the application (or driver). - -Some of the most useful chicken bits are described here. These bits are supported on the ROCm path: - -HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization. These flags will have performance impact and are not intended for production use. - -- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch. Equivalent to setting CUDA_LAUNCH_BLOCKING. -- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names. The HIP runtime will wait on the host after one of the named kernels executes. This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality. There is no indication if kernel names are spelled incorrectly. One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING". -- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller. - -These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP. -- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize. 
-- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize. - -- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines. -- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver. -- HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache. - -### Debugging HIP Applications - -- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotically increasing shortTid ID. Both of these fields are displayed in the HIP debug info. -``` -(gdb) p tls_tidInfo -$32 = {_shortTid = 1, _apiSeqNum = 803} -``` - -- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". -If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. -An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output. -This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. 
-The gdb syntax also supports using the variable name (in this case 'dst'): -``` -(gdb) p dst -$33 = (void *) 0x5ec7e9000 -(gdb) call hc::am_memtracker_print(dst) -TargetAddress:0x5ec7e9000 - 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) -... --->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) - -``` - -To debug an explicit address, cast the address to (void*) : -``` -(gdb) call hc::am_memtracker_print((void*)0x508c7f000) -``` -- Debugging GPUVM fault. -For example: -``` -Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. - -Program received signal SIGABRT, Aborted. -[Switching to Thread 0x7fffdffb5700 (LWP 14893)] -0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. -(gdb) bt -#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 -#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#4 0x00007ffff6f78107 in ?? 
() from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 -#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 -(gdb) info threads - Id Target Id Frame - 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 - 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 -* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 - 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) thread 1 -[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) bt -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#1 0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so -#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 -... -``` - -### General Debugging Tips -- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. 
So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above. -- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before retuning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so. -- VM faults inside kernels can be caused byi: - - incorrect code (ie a for loop which extends past array boundaries), i - - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). - - synchronization issues - - compiler issues (incorrect code generation from the compiler) - - runtime issues - --- General debug tips: -- 'gdb --args' can be used to conviently pass the executable and arguments to gdb. -- From inside GDB, you can set environment variables "set env". Note the command does not use an '=' sign: -``` -(gdb) set env HIP_DB 1 -``` -Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. -Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info. 
- - - From c67b828a5aae908ec3608f1be46919ebaab67fd7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:02:31 -0500 Subject: [PATCH 086/171] Update tests README --- tests/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/README.md b/tests/README.md index 223bd149dc..cb41cc10cd 100644 --- a/tests/README.md +++ b/tests/README.md @@ -59,5 +59,9 @@ Find the test and commandline that fail: grep -IR hipMemcpy-modes -IR ../tests/ ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 +# Guidelines for adding new tests +- Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues. +- Make the test run standalone without requiring command-line arguments. This makes it easier to debug since the name of the test is shown in the test report and if you know the name of the test you can then run the test. +- For long-running tests or tests with multiple phases, consider using the --tests option as an optional mechanism to allow debuggers to start with the failing subset of the test. From 27877f8854c88c6b806b5528c81faf9009ccab78 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:04:23 -0500 Subject: [PATCH 087/171] Add initial HIP_SYNC_NULL_STREAM=0 mode. This eliminates host-synchronization for null stream. Instead, the null-stream uses GPU-side events to wait for other streams. Default is OFF pending additional testing. Add enhanced null-stream test. Also refine HIP_TRACE_API.
--- include/hip/hcc_detail/hip_runtime_api.h | 7 +- src/grid_launch.cpp | 2 +- src/hip_device.cpp | 2 +- src/hip_event.cpp | 16 +- src/hip_hcc.cpp | 145 +++++++++++----- src/hip_hcc_internal.h | 12 +- src/hip_memory.cpp | 8 +- src/hip_module.cpp | 4 +- src/hip_stream.cpp | 4 +- tests/src/runtimeApi/stream/hipNullStream.cpp | 156 ++++++++++++++---- tests/src/test_common.h | 60 ++++++- 11 files changed, 320 insertions(+), 96 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 175fd64d29..e1aecef1e8 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -602,9 +602,12 @@ hipError_t hipStreamQuery(hipStream_t stream); * * @return #hipSuccess, #hipErrorInvalidResourceHandle * - * If the null stream is specified, this command blocks until all + * This command is host-synchronous : the host will block until the specified stream is empty. + * + * This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the + * command to wait for other streams on the same device to complete all pending operations. + * * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking. - * This command is host-synchronous : the host will block until the stream is empty. 
* * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy * diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index cac01df7dc..ffa50dec95 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -52,7 +52,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) { std::stringstream os; diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 01a213190f..93c1c20484 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device) hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) { HIP_INIT_API(props, device); - return ihipGetDeviceProperties(props, device); + return ihipLogStatus(ihipGetDeviceProperties(props, device)); } hipError_t hipSetDeviceFlags( unsigned int flags) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 61ac5cd3ab..fbaf5cc463 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -114,14 +114,17 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) HIP_INIT_API(event, stream); if (event && event->_state != hipEventStatusUnitialized) { + stream = ihipSyncAndResolveStream(stream); + event->_stream = stream; - if (stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream == NULL) { + + // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 + // If stream == NULL, wait on all queues. - // TODO-HCC fix this - is this conservative or still uses device timestamps? - // TODO-HCC can we use barrier or event marker to implement better solution? 
ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); event->_state = hipEventStatusRecorded; @@ -164,9 +167,10 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (event->_stream == NULL) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { auto *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + // TODO-HIP_SYNC_NULL_STREAM - can remove this code + ctx->locked_syncDefaultStream(true, true); return ihipLogStatus(hipSuccess); } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 07604fe85d..979a2e5028 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -92,6 +92,9 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; +// Sync on host between +int HIP_SYNC_NULL_STREAM = 1; + int HCC_OPT_FLUSH = 0; @@ -289,6 +292,32 @@ inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCri assert(streamCrit->_hasQueue); } +hc::hcWaitMode ihipStream_t::waitMode() const +{ + hc::hcWaitMode waitMode = hc::hcWaitModeActive; + + if (_scheduleMode == Auto) { + if (g_deviceCnt > g_numLogicalThreads) { + waitMode = hc::hcWaitModeActive; + } else { + waitMode = hc::hcWaitModeBlocked; + } + } else if (_scheduleMode == Spin) { + waitMode = hc::hcWaitModeActive; + } else if (_scheduleMode == Yield) { + waitMode = hc::hcWaitModeBlocked; + } else { + assert(0); // bad wait mode. + } + + if (HIP_WAIT_MODE == 1) { + waitMode = hc::hcWaitModeBlocked; + } else if (HIP_WAIT_MODE == 2) { + waitMode = hc::hcWaitModeActive; + } + + return waitMode; +} //Wait for all kernel and data copy commands in this stream to complete. 
//This signature should be used in routines that already have locked the stream mutex @@ -296,29 +325,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { if (crit->_hasQueue) { tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - hc::hcWaitMode waitMode = hc::hcWaitModeActive; - if (_scheduleMode == Auto) { - if (g_deviceCnt > g_numLogicalThreads) { - waitMode = hc::hcWaitModeActive; - } else { - waitMode = hc::hcWaitModeBlocked; - } - } else if (_scheduleMode == Spin) { - waitMode = hc::hcWaitModeActive; - } else if (_scheduleMode == Yield) { - waitMode = hc::hcWaitModeBlocked; - } else { - assert(0); // bad wait mode. - } - - if (HIP_WAIT_MODE == 1) { - waitMode = hc::hcWaitModeBlocked; - } else if (HIP_WAIT_MODE == 2) { - waitMode = hc::hcWaitModeActive; - } - - crit->_av.wait(waitMode); + crit->_av.wait(waitMode()); } else { tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); } @@ -337,7 +345,7 @@ void ihipStream_t::locked_wait() }; // Causes current stream to wait for specified event to complete: -// Note this does not require any kind of host serialization. +// Note this does not provide any kind of host serialization. void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); @@ -1061,26 +1069,57 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) // Implement "default" stream syncronization // This waits for all other streams to drain before continuing. // If waitOnSelf is set, this additionally waits for the default stream to empty. -void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf) +// In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other +// activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. 
+void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { LockedAccessor_CtxCrit_t crit(_criticalData); - tprintf(DB_SYNC, "syncDefaultStream\n"); + tprintf(DB_SYNC, "syncDefaultStream \n"); + + // Vector of ops sent to each stream that will complete before ops sent to null stream: + std::vector depOps; for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { + if (HIP_SYNC_NULL_STREAM) { - if (waitOnSelf || (stream != _defaultStream)) { - // TODO-hcc - use blocking or active wait here? - // TODO-sync - cudaDeviceBlockingSync - stream->locked_wait(); + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream + if (!(stream->_flags & hipStreamNonBlocking)) { + + if (waitOnSelf || (stream != _defaultStream)) { + stream->locked_wait(); + } + } + } else { + if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); + + // The last marker will provide appropriate visibility: + if (!streamCrit->_av.get_is_empty()) { + depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + } } } } + + + + // Enqueue a barrier to wait on all the barriers we sent above: + if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { + LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); + if (syncHost) { + defaultCf.wait(); // TODO - account for active or blocking here. 
+ } + } + + tprintf(DB_SYNC, " syncDefaultStream depOps=%zu\n", depOps.size()); + } @@ -1267,6 +1306,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. Bit 0x1=Fail on async copy with unpinned memory. Bit 0x2=Fail peer copy rather than use staging buffer copy"); READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); + READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); // TODO - review, can we remove this? READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); @@ -1274,7 +1314,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. 
if (HIP_DB && !COMPILE_HIP_DB) { @@ -1415,17 +1455,44 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { - ihipCtx_t *device = ihipGetTlsDefaultCtx(); + ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->locked_syncDefaultStream(false); + ctx->locked_syncDefaultStream(false, false); #endif - return device->_defaultStream; + return ctx->_defaultStream; } else { - // ALl streams have to wait for legacy default stream to be empty: + // All streams have to wait for legacy default stream to be empty: if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str()); - stream->getCtx()->_defaultStream->locked_wait(); + if (HIP_SYNC_NULL_STREAM) { + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + stream->getCtx()->_defaultStream->locked_wait(); + } else { + ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; + + tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); + + bool needMarker = false; + hc::completion_future dcf; + { + LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); + // TODO - could call create_blocking_marker(queue) + if (!defaultStreamCrit->_av.get_is_empty()) { + needMarker = true; + + // TODO - add "none_scope". 
+ dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } + } + + if (needMarker) { + // ensure any commands sent to this stream wait on the NULL stream before continuing + LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); + // TODO - could be "noret" version of create_blocking_marker + thisStreamCrit->_av.create_blocking_marker(dcf); + } + } } return stream; diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 7787242ca7..0d080f9225 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC; // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_NULL_STREAM; + // TODO - remove when this is standard behavior. extern int HCC_OPT_FLUSH; @@ -187,11 +189,11 @@ extern const char *API_COLOR_END; //--- -//HIP Trace modes +//HIP Trace modes - use with HIP_TRACE_API=... #define TRACE_ALL 0 // 0x1 #define TRACE_KCMD 1 // 0x2, kernel command #define TRACE_MCMD 2 // 0x4, memory command -#define TRACE_MEM 3 // 0x8 +#define TRACE_MEM 3 // 0x8, memory allocation or deallocation. //--- @@ -276,7 +278,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); API_TRACE(0, __VA_ARGS__); -// Like above, but will trace with TRACE_CMD. +// Like above, but will trace with a specified "special" bit. // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU: // kernel launches, copy commands, memory sets, etc. #define HIP_INIT_SPECIAL_API(tbit, ...) 
\ @@ -521,8 +523,10 @@ public: void locked_waitEvent(hipEvent_t event); void locked_recordEvent(hipEvent_t event); + ihipStreamCritical_t &criticalData() { return _criticalData; }; //--- + hc::hcWaitMode waitMode() const; // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit); @@ -786,7 +790,7 @@ public: // Functions: void locked_removeStream(ihipStream_t *s); void locked_reset(); void locked_waitAllStreams(); - void locked_syncDefaultStream(bool waitOnSelf); + void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); // Will allocate a queue and assign it to the needyStream: hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index cef676b572..5501fec734 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -525,7 +525,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind); if(symbolName == nullptr) { @@ -598,7 +598,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream); if(symbolName == nullptr) { @@ -807,7 +807,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, 
kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); if(width > dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); hipError_t e = hipSuccess; @@ -1041,7 +1041,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index b359e7a63c..da01f23769 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *func = sym; hmod->funcTrack.push_back(*func); } - return ihipLogStatus(ret); + return ret; } hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name){ HIP_INIT_API(hfunc, hmod, name); - return ihipModuleGetSymbol(hfunc, hmod, name); + return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name)); } diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index d7f8717725..34b4bc8851 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -150,7 +150,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; @@ -174,7 +174,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp index f8d201cb51..380979f6bc 
100644 --- a/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -27,8 +27,9 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include -unsigned p_streams = 6; +unsigned p_streams =16; int p_repeat = 10; +int p_db = 0; template @@ -45,7 +46,7 @@ vectorADDRepeat(hipLaunchParm lp, for (int j=1; j<=repeat;j++) { for (size_t i=offset; i::Streamer(size_t numElements, bool useNullStream) : HIPCHECK(hipStreamCreate(&_stream)); } HIPCHECK(hipEventCreate(&_event)); + + H2D(); + }; +template +void Streamer::H2D() +{ + HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); +} + +template +void Streamer::D2H() +{ + HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost)); +} + +template +void Streamer::reset() +{ + HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); + H2D(); + +} + + template void Streamer::enqueAsync() { @@ -131,6 +161,10 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) { + failed("Bad repeat argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -138,6 +172,15 @@ void parseMyArguments(int argc, char *argv[]) }; +void +printBuffer(std::string name, int *f, size_t numElements) +{ + std::cout << name << "\n"; + for (size_t i=0; i FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; - float *expected_H = (float*)malloc(numElements*sizeof(float)); + int *expected_H = (int*)malloc(numElements*sizeof(int)); - auto nullStreamer = new FloatStreamer(numElements, true); + auto nullStreamer = new IntStreamer(numElements, true); + + // Expected resultr - last streamer runs 
vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2; } for (int i=0; i Test 0x1 runAsnc\n"); - for (int i=0; ienqueAsync(); + for (int s=1; s Test %x runAsnc, #streams=%d\n", (1<reset(); + + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + + if (p_db) { + HIPCHECK(hipDeviceSynchronize()); + lastStreamer->D2H(); + printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20))); + printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20))); + printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20))); + } + nullStreamer->D2H(); + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - - auto lastStreamer = streamers[p_streams - 1]; - - // Dispatch to NULL stream, should wait for prior async activity to complete. 
- unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); - HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); - HIPCHECK(hipStreamSynchronize(0)); - - - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - if (p_tests & 0x2) { - printf ("==> Test 0x2 runAsnc-odd-only\n"); - for (int i=0; ienqueAsync(); + for (int s=1; sreset(); + printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s); + for (int i=0; ienqueAsync(); + } } + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + nullStreamer->D2H(); + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } + // Expected resultr - last streamer runs vectorADDRepeat + for (size_t i=0; i_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat); + } + + if (p_tests & 0x20000) { + + assert (p_streams >=2); // need a couple streams in order to run this test. + nullStreamer->reset(); + printf ("\n==> Test hipStreamSynchronize with defaultStream \n"); + + // Enqueue a long-running job to stream1 + streamers[0]->enqueAsync(); + + // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream. + // This function follows null stream semantics and will wait for all other blocking streams before returning. + // This will wait on the host + HIPCHECK(hipStreamSynchronize(0)); + + // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. 
+ HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); + + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + } + passed(); } diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 633ee6f825..1a6e51e08e 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -184,6 +184,20 @@ addCountReverse( const T *A_d, } +void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) +{ + // Initialize the host data: + for (size_t i=0; i void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) @@ -217,15 +231,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - // Initialize the host data: - for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, @@ -367,6 +376,43 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for (size_t i=0; i Date: Sat, 13 May 2017 16:00:26 +0000 Subject: [PATCH 088/171] Fix HIP_TRACE_API so kernel launch only printed when requested. 
--- src/grid_launch.cpp | 2 +- tests/src/test_common.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/grid_launch.cpp b/src/grid_launch.cpp index ffa50dec95..f3b28c5f60 100644 --- a/src/grid_launch.cpp +++ b/src/grid_launch.cpp @@ -54,7 +54,7 @@ namespace hip_impl { if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || - (COMPILE_HIP_DB && HIP_TRACE_API)) { + (COMPILE_HIP_DB && (HIP_TRACE_API & (1< void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { // Initialize the host data: From 2e1fec47ab44e3716c49e6f5f67a3512a45d3c46 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 18:56:40 -0500 Subject: [PATCH 089/171] Make hipMultiThreadStreams1 test a little harsher. Fail faster if synchronization rules are violated. Run vectorAddRevers to read last elements of array first - if the vector add kernel starts before preceding copy finishes we will read stale data and flag the error. Increase default array sizes, so synchronization errors more easily exposed. --- .../multiThread/hipMultiThreadStreams1.cpp | 45 +++++++++++++++---- tests/src/test_common.h | 39 ++++++++++++---- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 229ceea440..4f73b67ad7 100644 --- a/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -29,6 +29,8 @@ THE SOFTWARE. 
#include "hip/hip_runtime.h" #include "test_common.h" +int p_iters=10; + void printSep() { printf ("======================================================================================\n"); @@ -43,7 +45,7 @@ template< class P=HipTest::Unpinned, class C=HipTest::Memcpy > -void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) +void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) { using HipTest::MemTraits; @@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (size_t i=0; i::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream); + HIPCHECK (hipDeviceSynchronize()); + + for (size_t i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + //HIPCHECK(hipStreamSynchronize(stream)); + + // This is the null stream? 
+ //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); @@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) } HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + std::cout <<" pid" << pid << " success\n"; HIPCHECK (hipDeviceSynchronize()); - std::cout <<" pid" << pid << " success\n"; } template @@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s printf ("%s\n", __func__); std::cout << testName << std::endl; + size_t numElements = N; + // Test 2 threads operating on same stream: - std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream0); + std::thread t1 (simpleVectorAdd, numElements, p_iters/*iters*/, stream0); if (serialize) { t1.join(); } - std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream1); + std::thread t2 (simpleVectorAdd, numElements, p_iters/*iters*/, stream1); if (serialize) { t2.join(); } @@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s int main(int argc, char *argv[]) { + N = 8000000; HipTest::parseStandardArguments(argc, argv, true); printf ("info: set device to %d\n", p_gpuDevice); @@ -121,8 +148,8 @@ int main(int argc, char *argv[]) hipStream_t stream; HIPCHECK (hipStreamCreate(&stream)); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); HIPCHECK(hipStreamDestroy(stream)); } @@ -139,8 +166,8 @@ int main(int argc, char *argv[]) } if (p_tests & 0x4) { - test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); - test_multiThread_1 ("Multithread with two 
streams", stream0, stream1, false); + //test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + //test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); } diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 2c6905eea2..bb44c94745 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -146,6 +146,23 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +vectorADDReverse(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + B_d[i]; + } +} + + template __global__ void addCount( const T *A_d, @@ -343,7 +360,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d, // Assumes C_h contains vector add of A_h + B_h // Calls the test "failed" macro if a mismatch is detected. 
template -void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true) +size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true) { size_t mismatchCount = 0; size_t firstMismatch = 0; @@ -364,15 +381,19 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } } - if (expectMatch) { - if (mismatchCount) { - failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + if (reportMismatch) { + if (expectMatch) { + if (mismatchCount) { + failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + } + } else { + if (mismatchCount == 0) { + failed("expected mismatches but did not detect any!"); + } } - } else { - if (mismatchCount == 0) { - failed("expected mismatches but did not detect any!"); - } - } + } + + return mismatchCount; } From 9dceccf1365ad058a5ba383d5f83473278796d1b Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 16 May 2017 21:35:40 -0500 Subject: [PATCH 090/171] changed vector types to make sure it generate proper llvm vector types Change-Id: I6c4616dae137dc4eac35e5827dc5b7f3251e0247 --- include/hip/hcc_detail/hip_fp16.h | 125 +- include/hip/hcc_detail/hip_vector_types.h | 4067 +-------------------- src/hip_fp16.cpp | 442 +-- src/hip_hc_gfx803.ll | 147 +- 4 files changed, 266 insertions(+), 4515 deletions(-) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index 0a861b64af..f1f52e4122 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -25,17 +25,6 @@ THE SOFTWARE. 
#include "hip/hcc_detail/hip_vector_types.h" -#if __clang_major__ > 3 - -typedef __fp16 __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - typedef __half half; typedef __half2 half2; @@ -214,10 +203,10 @@ __device__ __half __ushort2half_ru(unsigned short int i); __device__ __half __ushort2half_rz(unsigned short int i); __device__ __half __ushort_as_half(const unsigned short int i); -extern "C" int __hip_hc_ir_hadd2_int(int, int); -extern "C" int __hip_hc_ir_hfma2_int(int, int, int); -extern "C" int __hip_hc_ir_hmul2_int(int, int); -extern "C" int __hip_hc_ir_hsub2_int(int, int); +extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2); +extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2); extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16"); extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16"); @@ -231,16 +220,16 @@ extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16"); extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16"); extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16"); -extern "C" int __hip_hc_ir_h2ceil_int(int); -extern "C" int __hip_hc_ir_h2cos_int(int); -extern "C" int __hip_hc_ir_h2exp2_int(int); -extern "C" int __hip_hc_ir_h2floor_int(int); -extern "C" int __hip_hc_ir_h2log2_int(int); -extern "C" int __hip_hc_ir_h2rcp_int(int); -extern "C" int __hip_hc_ir_h2rsqrt_int(int); -extern "C" int __hip_hc_ir_h2sin_int(int); -extern "C" int __hip_hc_ir_h2sqrt_int(int); -extern "C" int __hip_hc_ir_h2trunc_int(int); +extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2); +extern "C" __half2 __hip_hc_ir_h2cos_int(__half2); +extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2floor_int(__half2); +extern "C" __half2 
__hip_hc_ir_h2log2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sin_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2); /* Half2 Arithmetic Functions @@ -248,63 +237,63 @@ extern "C" int __hip_hc_ir_h2trunc_int(int); __device__ static inline __half2 __hadd2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hmul2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hsub2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hneg2(__half2 a) { __half2 c; - c.p[0] = - a.p[0]; - c.p[1] = - a.p[1]; + c.x = - a.x; + c.y = - a.y; return c; } __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ 
static inline __half2 h2div(__half2 a, __half2 b) { __half2 c; - c.p[0] = a.p[0] / b.p[0]; - c.p[1] = a.p[1] / b.p[1]; + c.x = a.x / b.x; + c.y = a.y / b.y; return c; } @@ -375,112 +364,94 @@ Half2 Math Operations __device__ static inline __half2 h2ceil(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2ceil_int(h.q); + a.xy = __hip_hc_ir_h2ceil_int(h.xy); return a; } __device__ static inline __half2 h2cos(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2cos_int(h.q); + a.xy = __hip_hc_ir_h2cos_int(h.xy); return a; } __device__ static inline __half2 h2exp(const __half2 h) { __half2 factor; - factor.p[0] = 1.442694; - factor.p[1] = 1.442694; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 1.442694; + factor.y = 1.442694; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp10(const __half2 h) { __half2 factor; - factor.p[0] = 3.3219281; - factor.p[1] = 3.3219281; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 3.3219281; + factor.y = 3.3219281; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2exp2_int(h.q); + a.xy = __hip_hc_ir_h2exp2_int(h.xy); return a; } __device__ static inline __half2 h2floor(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2floor_int(h.q); + a.xy = __hip_hc_ir_h2floor_int(h.xy); return a; } __device__ static inline __half2 h2log(const __half2 h) { __half2 factor; - factor.p[0] = 0.693147; - factor.p[1] = 0.693147; - factor. 
q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.693147; + factor.y = 0.693147; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log10(const __half2 h) { __half2 factor; - factor.p[0] = 0.301029; - factor.p[1] = 0.301029; - factor.q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.301029; + factor.y = 0.301029; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2log2_int(h.q); + a.xy = __hip_hc_ir_h2log2_int(h.xy); return a; } __device__ static inline __half2 h2rcp(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rcp_int(h.q); + a.xy = __hip_hc_ir_h2rcp_int(h.xy); return a; } __device__ static inline __half2 h2rsqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rsqrt_int(h.q); + a.xy = __hip_hc_ir_h2rsqrt_int(h.xy); return a; } __device__ static inline __half2 h2sin(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sin_int(h.q); + a.xy = __hip_hc_ir_h2sin_int(h.xy); return a; } __device__ static inline __half2 h2sqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sqrt_int(h.q); + a.xy = __hip_hc_ir_h2sqrt_int(h.xy); return a; } __device__ static inline __half2 h2trunc(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2trunc_int(h.q); + a.xy = __hip_hc_ir_h2trunc_int(h.xy); return a; } -#endif - -#if __clang_major__ == 3 - -typedef struct { - unsigned x: 16; -} __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - - -#endif - #endif diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 35c6c23548..251da504ab 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -34,1120 +34,93 @@ THE SOFTWARE. 
#include "hip/hcc_detail/host_defines.h" -#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - - -#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val) {} \ - -#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} - -struct uchar1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, 
signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) - - #endif - unsigned char x; - -} __attribute__((aligned(1))); - -struct uchar2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) - #endif - union { - struct { - unsigned char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct uchar3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) - #endif - unsigned char x, y, z; -}; - -struct uchar4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) - #endif - union { - struct { - unsigned char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - -struct char1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned 
short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) - #endif - signed char x; -} __attribute__((aligned(1))); - -struct char2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) - #endif - union { - struct { - signed char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct char3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) - #endif - signed char x, y, z; -}; - -struct char4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) - #endif - union { - struct { - signed char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - - -struct ushort1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) - #endif - unsigned short x; -} __attribute__((aligned(2))); - -struct ushort2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) - #endif - union { - struct { - unsigned short x, y; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - -struct ushort3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) - #endif - unsigned short x, y, z; -}; - -struct ushort4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) - #endif - union { - struct { - unsigned short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - -struct short1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) - #endif - signed short x; -} __attribute__((aligned(2))); - -struct short2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) - #endif - union { - struct { - signed short x, y; - }; - unsigned int a; - }; - -} __attribute__((aligned(4))); - -struct short3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed 
int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) - #endif - signed short x, y, z; -}; - -struct short4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) - #endif - union { - struct { - signed short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - - -struct uint1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) - #endif - unsigned int x; -} __attribute__((aligned(4))); - -struct uint2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) - #endif - unsigned int x, y; -} __attribute__((aligned(8))); - -struct uint3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) - #endif - unsigned int x, y, z; -}; - -struct uint4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) - #endif - unsigned int x, y, z, w; -} __attribute__((aligned(16))); - -struct int1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) - #endif - signed int x; -} __attribute__((aligned(4))); - -struct int2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) - #endif - signed int x, y; -} __attribute__((aligned(8))); - -struct int3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) - #endif - signed int x, y, z; -}; - -struct int4 { - #ifdef __cplusplus - public: - 
MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) - #endif - signed int x, y, z, w; -} __attribute__((aligned(16))); - - -struct float1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) - #endif - float x; -} __attribute__((aligned(4))); - -struct float2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) - #endif - float x, y; -} __attribute__((aligned(8))); - -struct float3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) - #endif - float x, y, z; -}; - -struct float4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) 
- MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) - #endif - float x, y, z, w; -} __attribute__((aligned(16))); - - - -struct double1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) - #endif - double x; -} __attribute__((aligned(8))); - -struct double2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) - #endif - double x, y; -} __attribute__((aligned(16))); - -struct double3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) - #endif - double x, y, z; -}; - -struct double4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) - #endif - double x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) - #endif - unsigned long x; -} __attribute__((aligned(8))); - -struct ulong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) - #endif - unsigned long x, y; -} __attribute__((aligned(16))); - -struct ulong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) - #endif - unsigned long x, y, z; -}; - -struct ulong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, 
double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) - #endif - unsigned long x, y, z, w; -} __attribute__((aligned(32))); - - -struct long1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) - #endif - signed long x; -} __attribute__((aligned(8))); - -struct long2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed 
long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) - #endif - signed long x, y; -} __attribute__((aligned(16))); - -struct long3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) - #endif - signed long x, y, z; -}; - -struct long4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed 
long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulonglong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long) - #endif - unsigned long long x; -} __attribute__((aligned(8))); - -struct ulonglong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, 
signed long long) - #endif - unsigned long long x, y; -} __attribute__((aligned(16))); - -struct ulonglong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long) - #endif - unsigned long long x, y, z; -}; - -struct ulonglong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long) - #endif - unsigned long long x, y, z, w; -} __attribute__((aligned(32))); - - -struct longlong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long) - #endif - signed long long x; -} __attribute__((aligned(8))); - -struct longlong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long) - #endif - signed long long x, y; -} __attribute__((aligned(16))); - -struct longlong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long) - #endif - signed long long x, y, z; -}; - -struct longlong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); +#if __cplusplus + +typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); +typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); +typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); +typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); + +typedef signed char char1 __attribute__((ext_vector_type(1))); +typedef signed char char2 __attribute__((ext_vector_type(2))); +typedef signed char char3 __attribute__((ext_vector_type(3))); +typedef signed char char4 __attribute__((ext_vector_type(4))); + +typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); +typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); +typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); +typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); + +typedef signed short short1 __attribute__((ext_vector_type(1))); +typedef signed short short2 __attribute__((ext_vector_type(2))); +typedef signed short short3 __attribute__((ext_vector_type(3))); +typedef signed short short4 __attribute__((ext_vector_type(4))); + +typedef __fp16 __half; + +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 __half3 __attribute__((ext_vector_type(3))); +typedef __fp16 __half4 __attribute__((ext_vector_type(4))); + +typedef unsigned int uint1 __attribute__((ext_vector_type(1))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned int uint3 __attribute__((ext_vector_type(3))); +typedef unsigned int uint4 __attribute__((ext_vector_type(4))); + +typedef signed int int1 __attribute__((ext_vector_type(1))); +typedef signed int int2 __attribute__((ext_vector_type(2))); +typedef signed int int3 __attribute__((ext_vector_type(3))); +typedef signed int int4 
__attribute__((ext_vector_type(4))); + +typedef float float1 __attribute__((ext_vector_type(1))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); +typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); +typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); +typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + +typedef signed long long1 __attribute__((ext_vector_type(1))); +typedef signed long long2 __attribute__((ext_vector_type(2))); +typedef signed long long3 __attribute__((ext_vector_type(3))); +typedef signed long long4 __attribute__((ext_vector_type(4))); + +typedef double double1 __attribute__((ext_vector_type(1))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); + +typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); +typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); +typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); +typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + +typedef signed long long longlong1 __attribute__((ext_vector_type(1))); +typedef signed long long longlong2 __attribute__((ext_vector_type(2))); +typedef signed long long longlong3 __attribute__((ext_vector_type(3))); +typedef signed long long longlong4 __attribute__((ext_vector_type(4))); #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x) { \ + type ret; \ ret.x = x; \ return ret; \ } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type 
make_##type(comp x, comp y) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y) { \ + type ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1155,8 +128,8 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1164,6 +137,7 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z return ret; \ } + DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); @@ -1225,2894 +199,9 @@ DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3); DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4); -#if __cplusplus - -#define DECLOP_1VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - return lhs; 
\ -} - -#define DECLOP_1VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - return val; \ -} - -#define DECLOP_1VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - val.x op; \ - return ret; \ -} - -#define DECLOP_1VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} - -#define DECLOP_1VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type& rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type& rhs) { \ - return op rhs.x; \ -} - -/* - Two Element Access -*/ - -#define DECLOP_2VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - return lhs; \ -} - -#define 
DECLOP_2VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - return val; \ -} - -#define DECLOP_2VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - val.x op; \ - val.y op; \ - return ret; \ -} - -#define DECLOP_2VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} - -#define DECLOP_2VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y); \ -} - - -/* - Three Element Access -*/ - -#define DECLOP_3VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * 
rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - return lhs; \ -} - -#define DECLOP_3VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - return val; \ -} - -#define DECLOP_3VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - val.x op; \ - val.y op; \ - val.z op; \ - return ret; \ -} - -#define DECLOP_3VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ - -#define DECLOP_3VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z); \ -} - - -/* - Four Element Access -*/ - -#define DECLOP_4VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - ret.w = lhs.w op 
rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - ret.w = lhs.w * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * rhs.z; \ - ret.w = lhs * rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - lhs.w op rhs.w; \ - return lhs; \ -} - -#define DECLOP_4VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - op val.w; \ - return val; \ -} - -#define DECLOP_4VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - ret.w = val.w; \ - val.x op; \ - val.y op; \ - val.z op; \ - val.w op; \ - return ret; \ -} - -#define DECLOP_4VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} - -#define 
DECLOP_4VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - ret.w = op rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \ -} - - -/* -Overloading operators -*/ - -// UNSIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(uchar1, +) -DECLOP_1VAR_2IN_1OUT(uchar1, -) -DECLOP_1VAR_2IN_1OUT(uchar1, *) -DECLOP_1VAR_2IN_1OUT(uchar1, /) -DECLOP_1VAR_2IN_1OUT(uchar1, %) -DECLOP_1VAR_2IN_1OUT(uchar1, &) -DECLOP_1VAR_2IN_1OUT(uchar1, |) -DECLOP_1VAR_2IN_1OUT(uchar1, ^) -DECLOP_1VAR_2IN_1OUT(uchar1, <<) -DECLOP_1VAR_2IN_1OUT(uchar1, >>) - - -DECLOP_1VAR_ASSIGN(uchar1, +=) -DECLOP_1VAR_ASSIGN(uchar1, -=) -DECLOP_1VAR_ASSIGN(uchar1, *=) -DECLOP_1VAR_ASSIGN(uchar1, /=) -DECLOP_1VAR_ASSIGN(uchar1, %=) -DECLOP_1VAR_ASSIGN(uchar1, &=) -DECLOP_1VAR_ASSIGN(uchar1, |=) -DECLOP_1VAR_ASSIGN(uchar1, ^=) -DECLOP_1VAR_ASSIGN(uchar1, <<=) -DECLOP_1VAR_ASSIGN(uchar1, >>=) - -DECLOP_1VAR_PREOP(uchar1, ++) -DECLOP_1VAR_PREOP(uchar1, --) - -DECLOP_1VAR_POSTOP(uchar1, ++) -DECLOP_1VAR_POSTOP(uchar1, --) - -DECLOP_1VAR_COMP(uchar1, ==) -DECLOP_1VAR_COMP(uchar1, !=) -DECLOP_1VAR_COMP(uchar1, <) -DECLOP_1VAR_COMP(uchar1, >) -DECLOP_1VAR_COMP(uchar1, <=) -DECLOP_1VAR_COMP(uchar1, >=) - -DECLOP_1VAR_COMP(uchar1, &&) -DECLOP_1VAR_COMP(uchar1, ||) - -DECLOP_1VAR_1IN_1OUT(uchar1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uchar1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, float) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, double) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long) - -// UNSIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(uchar2, +) -DECLOP_2VAR_2IN_1OUT(uchar2, -) -DECLOP_2VAR_2IN_1OUT(uchar2, *) -DECLOP_2VAR_2IN_1OUT(uchar2, /) -DECLOP_2VAR_2IN_1OUT(uchar2, %) -DECLOP_2VAR_2IN_1OUT(uchar2, &) -DECLOP_2VAR_2IN_1OUT(uchar2, |) -DECLOP_2VAR_2IN_1OUT(uchar2, ^) -DECLOP_2VAR_2IN_1OUT(uchar2, <<) -DECLOP_2VAR_2IN_1OUT(uchar2, >>) - -DECLOP_2VAR_ASSIGN(uchar2, +=) -DECLOP_2VAR_ASSIGN(uchar2, -=) -DECLOP_2VAR_ASSIGN(uchar2, *=) -DECLOP_2VAR_ASSIGN(uchar2, /=) -DECLOP_2VAR_ASSIGN(uchar2, %=) -DECLOP_2VAR_ASSIGN(uchar2, &=) -DECLOP_2VAR_ASSIGN(uchar2, |=) -DECLOP_2VAR_ASSIGN(uchar2, ^=) -DECLOP_2VAR_ASSIGN(uchar2, <<=) -DECLOP_2VAR_ASSIGN(uchar2, >>=) - -DECLOP_2VAR_PREOP(uchar2, ++) -DECLOP_2VAR_PREOP(uchar2, --) - -DECLOP_2VAR_POSTOP(uchar2, ++) -DECLOP_2VAR_POSTOP(uchar2, --) - -DECLOP_2VAR_COMP(uchar2, ==) -DECLOP_2VAR_COMP(uchar2, !=) -DECLOP_2VAR_COMP(uchar2, <) -DECLOP_2VAR_COMP(uchar2, >) -DECLOP_2VAR_COMP(uchar2, <=) -DECLOP_2VAR_COMP(uchar2, >=) - -DECLOP_2VAR_COMP(uchar2, &&) -DECLOP_2VAR_COMP(uchar2, ||) - -DECLOP_2VAR_1IN_1OUT(uchar2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uchar2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, float) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, double) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long) - -// UNSIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(uchar3, +) -DECLOP_3VAR_2IN_1OUT(uchar3, -) -DECLOP_3VAR_2IN_1OUT(uchar3, *) -DECLOP_3VAR_2IN_1OUT(uchar3, /) -DECLOP_3VAR_2IN_1OUT(uchar3, %) -DECLOP_3VAR_2IN_1OUT(uchar3, &) -DECLOP_3VAR_2IN_1OUT(uchar3, |) -DECLOP_3VAR_2IN_1OUT(uchar3, ^) -DECLOP_3VAR_2IN_1OUT(uchar3, <<) -DECLOP_3VAR_2IN_1OUT(uchar3, >>) - -DECLOP_3VAR_ASSIGN(uchar3, +=) -DECLOP_3VAR_ASSIGN(uchar3, -=) -DECLOP_3VAR_ASSIGN(uchar3, *=) -DECLOP_3VAR_ASSIGN(uchar3, /=) -DECLOP_3VAR_ASSIGN(uchar3, %=) -DECLOP_3VAR_ASSIGN(uchar3, &=) -DECLOP_3VAR_ASSIGN(uchar3, |=) -DECLOP_3VAR_ASSIGN(uchar3, ^=) -DECLOP_3VAR_ASSIGN(uchar3, <<=) -DECLOP_3VAR_ASSIGN(uchar3, >>=) - -DECLOP_3VAR_PREOP(uchar3, ++) -DECLOP_3VAR_PREOP(uchar3, --) - -DECLOP_3VAR_POSTOP(uchar3, ++) -DECLOP_3VAR_POSTOP(uchar3, --) - -DECLOP_3VAR_COMP(uchar3, ==) -DECLOP_3VAR_COMP(uchar3, !=) -DECLOP_3VAR_COMP(uchar3, <) -DECLOP_3VAR_COMP(uchar3, >) -DECLOP_3VAR_COMP(uchar3, <=) -DECLOP_3VAR_COMP(uchar3, >=) - -DECLOP_3VAR_COMP(uchar3, &&) -DECLOP_3VAR_COMP(uchar3, ||) - -DECLOP_3VAR_1IN_1OUT(uchar3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uchar3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, float) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, double) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long) - -// UNSIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(uchar4, +) -DECLOP_4VAR_2IN_1OUT(uchar4, -) -DECLOP_4VAR_2IN_1OUT(uchar4, *) -DECLOP_4VAR_2IN_1OUT(uchar4, /) -DECLOP_4VAR_2IN_1OUT(uchar4, %) -DECLOP_4VAR_2IN_1OUT(uchar4, &) -DECLOP_4VAR_2IN_1OUT(uchar4, |) -DECLOP_4VAR_2IN_1OUT(uchar4, ^) -DECLOP_4VAR_2IN_1OUT(uchar4, <<) -DECLOP_4VAR_2IN_1OUT(uchar4, >>) - -DECLOP_4VAR_ASSIGN(uchar4, +=) -DECLOP_4VAR_ASSIGN(uchar4, -=) -DECLOP_4VAR_ASSIGN(uchar4, *=) -DECLOP_4VAR_ASSIGN(uchar4, /=) -DECLOP_4VAR_ASSIGN(uchar4, %=) -DECLOP_4VAR_ASSIGN(uchar4, &=) -DECLOP_4VAR_ASSIGN(uchar4, |=) -DECLOP_4VAR_ASSIGN(uchar4, ^=) -DECLOP_4VAR_ASSIGN(uchar4, <<=) -DECLOP_4VAR_ASSIGN(uchar4, >>=) - -DECLOP_4VAR_PREOP(uchar4, ++) -DECLOP_4VAR_PREOP(uchar4, --) - -DECLOP_4VAR_POSTOP(uchar4, ++) -DECLOP_4VAR_POSTOP(uchar4, --) - -DECLOP_4VAR_COMP(uchar4, ==) -DECLOP_4VAR_COMP(uchar4, !=) -DECLOP_4VAR_COMP(uchar4, <) -DECLOP_4VAR_COMP(uchar4, >) -DECLOP_4VAR_COMP(uchar4, <=) -DECLOP_4VAR_COMP(uchar4, >=) - -DECLOP_4VAR_COMP(uchar4, &&) -DECLOP_4VAR_COMP(uchar4, ||) - -DECLOP_4VAR_1IN_1OUT(uchar4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uchar4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) - -// SIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(char1, +) -DECLOP_1VAR_2IN_1OUT(char1, -) -DECLOP_1VAR_2IN_1OUT(char1, *) -DECLOP_1VAR_2IN_1OUT(char1, /) -DECLOP_1VAR_2IN_1OUT(char1, %) -DECLOP_1VAR_2IN_1OUT(char1, &) -DECLOP_1VAR_2IN_1OUT(char1, |) -DECLOP_1VAR_2IN_1OUT(char1, ^) -DECLOP_1VAR_2IN_1OUT(char1, <<) -DECLOP_1VAR_2IN_1OUT(char1, >>) - - -DECLOP_1VAR_ASSIGN(char1, +=) -DECLOP_1VAR_ASSIGN(char1, -=) -DECLOP_1VAR_ASSIGN(char1, *=) -DECLOP_1VAR_ASSIGN(char1, /=) -DECLOP_1VAR_ASSIGN(char1, %=) -DECLOP_1VAR_ASSIGN(char1, &=) -DECLOP_1VAR_ASSIGN(char1, |=) -DECLOP_1VAR_ASSIGN(char1, ^=) -DECLOP_1VAR_ASSIGN(char1, <<=) -DECLOP_1VAR_ASSIGN(char1, >>=) - -DECLOP_1VAR_PREOP(char1, ++) -DECLOP_1VAR_PREOP(char1, --) - -DECLOP_1VAR_POSTOP(char1, ++) -DECLOP_1VAR_POSTOP(char1, --) - -DECLOP_1VAR_COMP(char1, ==) -DECLOP_1VAR_COMP(char1, !=) -DECLOP_1VAR_COMP(char1, <) -DECLOP_1VAR_COMP(char1, >) -DECLOP_1VAR_COMP(char1, <=) -DECLOP_1VAR_COMP(char1, >=) - -DECLOP_1VAR_COMP(char1, &&) -DECLOP_1VAR_COMP(char1, ||) - -DECLOP_1VAR_1IN_1OUT(char1, ~) -DECLOP_1VAR_1IN_BOOLOUT(char1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(char1, float) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(char1, double) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) - -// SIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(char2, +) -DECLOP_2VAR_2IN_1OUT(char2, -) -DECLOP_2VAR_2IN_1OUT(char2, *) -DECLOP_2VAR_2IN_1OUT(char2, /) -DECLOP_2VAR_2IN_1OUT(char2, %) -DECLOP_2VAR_2IN_1OUT(char2, &) -DECLOP_2VAR_2IN_1OUT(char2, |) -DECLOP_2VAR_2IN_1OUT(char2, ^) -DECLOP_2VAR_2IN_1OUT(char2, <<) -DECLOP_2VAR_2IN_1OUT(char2, >>) - -DECLOP_2VAR_ASSIGN(char2, +=) -DECLOP_2VAR_ASSIGN(char2, -=) -DECLOP_2VAR_ASSIGN(char2, *=) -DECLOP_2VAR_ASSIGN(char2, /=) -DECLOP_2VAR_ASSIGN(char2, %=) -DECLOP_2VAR_ASSIGN(char2, &=) -DECLOP_2VAR_ASSIGN(char2, |=) -DECLOP_2VAR_ASSIGN(char2, ^=) -DECLOP_2VAR_ASSIGN(char2, <<=) -DECLOP_2VAR_ASSIGN(char2, >>=) - -DECLOP_2VAR_PREOP(char2, ++) -DECLOP_2VAR_PREOP(char2, --) - -DECLOP_2VAR_POSTOP(char2, ++) -DECLOP_2VAR_POSTOP(char2, --) - -DECLOP_2VAR_COMP(char2, ==) -DECLOP_2VAR_COMP(char2, !=) -DECLOP_2VAR_COMP(char2, <) -DECLOP_2VAR_COMP(char2, >) -DECLOP_2VAR_COMP(char2, <=) -DECLOP_2VAR_COMP(char2, >=) - -DECLOP_2VAR_COMP(char2, &&) -DECLOP_2VAR_COMP(char2, ||) - -DECLOP_2VAR_1IN_1OUT(char2, ~) -DECLOP_2VAR_1IN_BOOLOUT(char2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(char2, float) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(char2, double) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) - -// SIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(char3, +) -DECLOP_3VAR_2IN_1OUT(char3, -) -DECLOP_3VAR_2IN_1OUT(char3, *) -DECLOP_3VAR_2IN_1OUT(char3, /) -DECLOP_3VAR_2IN_1OUT(char3, %) -DECLOP_3VAR_2IN_1OUT(char3, &) -DECLOP_3VAR_2IN_1OUT(char3, |) -DECLOP_3VAR_2IN_1OUT(char3, ^) -DECLOP_3VAR_2IN_1OUT(char3, <<) -DECLOP_3VAR_2IN_1OUT(char3, >>) - -DECLOP_3VAR_ASSIGN(char3, +=) -DECLOP_3VAR_ASSIGN(char3, -=) -DECLOP_3VAR_ASSIGN(char3, *=) -DECLOP_3VAR_ASSIGN(char3, /=) -DECLOP_3VAR_ASSIGN(char3, %=) -DECLOP_3VAR_ASSIGN(char3, &=) -DECLOP_3VAR_ASSIGN(char3, |=) -DECLOP_3VAR_ASSIGN(char3, ^=) -DECLOP_3VAR_ASSIGN(char3, <<=) -DECLOP_3VAR_ASSIGN(char3, >>=) - -DECLOP_3VAR_PREOP(char3, ++) -DECLOP_3VAR_PREOP(char3, --) - -DECLOP_3VAR_POSTOP(char3, ++) -DECLOP_3VAR_POSTOP(char3, --) - -DECLOP_3VAR_COMP(char3, ==) -DECLOP_3VAR_COMP(char3, !=) -DECLOP_3VAR_COMP(char3, <) -DECLOP_3VAR_COMP(char3, >) -DECLOP_3VAR_COMP(char3, <=) -DECLOP_3VAR_COMP(char3, >=) - -DECLOP_3VAR_COMP(char3, &&) -DECLOP_3VAR_COMP(char3, ||) - -DECLOP_3VAR_1IN_1OUT(char3, ~) -DECLOP_3VAR_1IN_BOOLOUT(char3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(char3, float) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(char3, double) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) - -// SIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(char4, +) -DECLOP_4VAR_2IN_1OUT(char4, -) -DECLOP_4VAR_2IN_1OUT(char4, *) -DECLOP_4VAR_2IN_1OUT(char4, /) -DECLOP_4VAR_2IN_1OUT(char4, %) -DECLOP_4VAR_2IN_1OUT(char4, &) -DECLOP_4VAR_2IN_1OUT(char4, |) -DECLOP_4VAR_2IN_1OUT(char4, ^) -DECLOP_4VAR_2IN_1OUT(char4, <<) -DECLOP_4VAR_2IN_1OUT(char4, >>) - -DECLOP_4VAR_ASSIGN(char4, +=) -DECLOP_4VAR_ASSIGN(char4, -=) -DECLOP_4VAR_ASSIGN(char4, *=) -DECLOP_4VAR_ASSIGN(char4, /=) -DECLOP_4VAR_ASSIGN(char4, %=) -DECLOP_4VAR_ASSIGN(char4, &=) -DECLOP_4VAR_ASSIGN(char4, |=) -DECLOP_4VAR_ASSIGN(char4, ^=) -DECLOP_4VAR_ASSIGN(char4, <<=) -DECLOP_4VAR_ASSIGN(char4, >>=) - -DECLOP_4VAR_PREOP(char4, ++) -DECLOP_4VAR_PREOP(char4, --) - -DECLOP_4VAR_POSTOP(char4, ++) -DECLOP_4VAR_POSTOP(char4, --) - -DECLOP_4VAR_COMP(char4, ==) -DECLOP_4VAR_COMP(char4, !=) -DECLOP_4VAR_COMP(char4, <) -DECLOP_4VAR_COMP(char4, >) -DECLOP_4VAR_COMP(char4, <=) -DECLOP_4VAR_COMP(char4, >=) - -DECLOP_4VAR_COMP(char4, &&) -DECLOP_4VAR_COMP(char4, ||) - -DECLOP_4VAR_1IN_1OUT(char4, ~) -DECLOP_4VAR_1IN_BOOLOUT(char4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(char4, float) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(char4, double) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) - -// UNSIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(ushort1, +) -DECLOP_1VAR_2IN_1OUT(ushort1, -) -DECLOP_1VAR_2IN_1OUT(ushort1, *) -DECLOP_1VAR_2IN_1OUT(ushort1, /) -DECLOP_1VAR_2IN_1OUT(ushort1, %) -DECLOP_1VAR_2IN_1OUT(ushort1, &) -DECLOP_1VAR_2IN_1OUT(ushort1, |) -DECLOP_1VAR_2IN_1OUT(ushort1, ^) -DECLOP_1VAR_2IN_1OUT(ushort1, <<) -DECLOP_1VAR_2IN_1OUT(ushort1, >>) - - -DECLOP_1VAR_ASSIGN(ushort1, +=) -DECLOP_1VAR_ASSIGN(ushort1, -=) -DECLOP_1VAR_ASSIGN(ushort1, *=) -DECLOP_1VAR_ASSIGN(ushort1, /=) -DECLOP_1VAR_ASSIGN(ushort1, %=) -DECLOP_1VAR_ASSIGN(ushort1, &=) -DECLOP_1VAR_ASSIGN(ushort1, |=) -DECLOP_1VAR_ASSIGN(ushort1, ^=) -DECLOP_1VAR_ASSIGN(ushort1, <<=) -DECLOP_1VAR_ASSIGN(ushort1, >>=) - -DECLOP_1VAR_PREOP(ushort1, ++) -DECLOP_1VAR_PREOP(ushort1, --) - -DECLOP_1VAR_POSTOP(ushort1, ++) -DECLOP_1VAR_POSTOP(ushort1, --) - -DECLOP_1VAR_COMP(ushort1, ==) -DECLOP_1VAR_COMP(ushort1, !=) -DECLOP_1VAR_COMP(ushort1, <) -DECLOP_1VAR_COMP(ushort1, >) -DECLOP_1VAR_COMP(ushort1, <=) -DECLOP_1VAR_COMP(ushort1, >=) - -DECLOP_1VAR_COMP(ushort1, &&) -DECLOP_1VAR_COMP(ushort1, ||) - -DECLOP_1VAR_1IN_1OUT(ushort1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) - -// UNSIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(ushort2, +) -DECLOP_2VAR_2IN_1OUT(ushort2, -) -DECLOP_2VAR_2IN_1OUT(ushort2, *) -DECLOP_2VAR_2IN_1OUT(ushort2, /) -DECLOP_2VAR_2IN_1OUT(ushort2, %) -DECLOP_2VAR_2IN_1OUT(ushort2, &) -DECLOP_2VAR_2IN_1OUT(ushort2, |) -DECLOP_2VAR_2IN_1OUT(ushort2, ^) -DECLOP_2VAR_2IN_1OUT(ushort2, <<) -DECLOP_2VAR_2IN_1OUT(ushort2, >>) - -DECLOP_2VAR_ASSIGN(ushort2, +=) -DECLOP_2VAR_ASSIGN(ushort2, -=) -DECLOP_2VAR_ASSIGN(ushort2, *=) -DECLOP_2VAR_ASSIGN(ushort2, /=) -DECLOP_2VAR_ASSIGN(ushort2, %=) -DECLOP_2VAR_ASSIGN(ushort2, &=) -DECLOP_2VAR_ASSIGN(ushort2, |=) -DECLOP_2VAR_ASSIGN(ushort2, ^=) -DECLOP_2VAR_ASSIGN(ushort2, <<=) -DECLOP_2VAR_ASSIGN(ushort2, >>=) - -DECLOP_2VAR_PREOP(ushort2, ++) -DECLOP_2VAR_PREOP(ushort2, --) - -DECLOP_2VAR_POSTOP(ushort2, ++) -DECLOP_2VAR_POSTOP(ushort2, --) - -DECLOP_2VAR_COMP(ushort2, ==) -DECLOP_2VAR_COMP(ushort2, !=) -DECLOP_2VAR_COMP(ushort2, <) -DECLOP_2VAR_COMP(ushort2, >) -DECLOP_2VAR_COMP(ushort2, <=) -DECLOP_2VAR_COMP(ushort2, >=) - -DECLOP_2VAR_COMP(ushort2, &&) -DECLOP_2VAR_COMP(ushort2, ||) - -DECLOP_2VAR_1IN_1OUT(ushort2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) - -// UNSIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(ushort3, +) -DECLOP_3VAR_2IN_1OUT(ushort3, -) -DECLOP_3VAR_2IN_1OUT(ushort3, *) -DECLOP_3VAR_2IN_1OUT(ushort3, /) -DECLOP_3VAR_2IN_1OUT(ushort3, %) -DECLOP_3VAR_2IN_1OUT(ushort3, &) -DECLOP_3VAR_2IN_1OUT(ushort3, |) -DECLOP_3VAR_2IN_1OUT(ushort3, ^) -DECLOP_3VAR_2IN_1OUT(ushort3, <<) -DECLOP_3VAR_2IN_1OUT(ushort3, >>) - -DECLOP_3VAR_ASSIGN(ushort3, +=) -DECLOP_3VAR_ASSIGN(ushort3, -=) -DECLOP_3VAR_ASSIGN(ushort3, *=) -DECLOP_3VAR_ASSIGN(ushort3, /=) -DECLOP_3VAR_ASSIGN(ushort3, %=) -DECLOP_3VAR_ASSIGN(ushort3, &=) -DECLOP_3VAR_ASSIGN(ushort3, |=) -DECLOP_3VAR_ASSIGN(ushort3, ^=) -DECLOP_3VAR_ASSIGN(ushort3, <<=) -DECLOP_3VAR_ASSIGN(ushort3, >>=) - -DECLOP_3VAR_PREOP(ushort3, ++) -DECLOP_3VAR_PREOP(ushort3, --) - -DECLOP_3VAR_POSTOP(ushort3, ++) -DECLOP_3VAR_POSTOP(ushort3, --) - -DECLOP_3VAR_COMP(ushort3, ==) -DECLOP_3VAR_COMP(ushort3, !=) -DECLOP_3VAR_COMP(ushort3, <) -DECLOP_3VAR_COMP(ushort3, >) -DECLOP_3VAR_COMP(ushort3, <=) -DECLOP_3VAR_COMP(ushort3, >=) - -DECLOP_3VAR_COMP(ushort3, &&) -DECLOP_3VAR_COMP(ushort3, ||) - -DECLOP_3VAR_1IN_1OUT(ushort3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) - -// UNSIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(ushort4, +) -DECLOP_4VAR_2IN_1OUT(ushort4, -) -DECLOP_4VAR_2IN_1OUT(ushort4, *) -DECLOP_4VAR_2IN_1OUT(ushort4, /) -DECLOP_4VAR_2IN_1OUT(ushort4, %) -DECLOP_4VAR_2IN_1OUT(ushort4, &) -DECLOP_4VAR_2IN_1OUT(ushort4, |) -DECLOP_4VAR_2IN_1OUT(ushort4, ^) -DECLOP_4VAR_2IN_1OUT(ushort4, <<) -DECLOP_4VAR_2IN_1OUT(ushort4, >>) - -DECLOP_4VAR_ASSIGN(ushort4, +=) -DECLOP_4VAR_ASSIGN(ushort4, -=) -DECLOP_4VAR_ASSIGN(ushort4, *=) -DECLOP_4VAR_ASSIGN(ushort4, /=) -DECLOP_4VAR_ASSIGN(ushort4, %=) -DECLOP_4VAR_ASSIGN(ushort4, &=) -DECLOP_4VAR_ASSIGN(ushort4, |=) -DECLOP_4VAR_ASSIGN(ushort4, ^=) -DECLOP_4VAR_ASSIGN(ushort4, <<=) -DECLOP_4VAR_ASSIGN(ushort4, >>=) - -DECLOP_4VAR_PREOP(ushort4, ++) -DECLOP_4VAR_PREOP(ushort4, --) - -DECLOP_4VAR_POSTOP(ushort4, ++) -DECLOP_4VAR_POSTOP(ushort4, --) - -DECLOP_4VAR_COMP(ushort4, ==) -DECLOP_4VAR_COMP(ushort4, !=) -DECLOP_4VAR_COMP(ushort4, <) -DECLOP_4VAR_COMP(ushort4, >) -DECLOP_4VAR_COMP(ushort4, <=) -DECLOP_4VAR_COMP(ushort4, >=) - -DECLOP_4VAR_COMP(ushort4, &&) -DECLOP_4VAR_COMP(ushort4, ||) - -DECLOP_4VAR_1IN_1OUT(ushort4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) - -// SIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(short1, +) -DECLOP_1VAR_2IN_1OUT(short1, -) -DECLOP_1VAR_2IN_1OUT(short1, *) -DECLOP_1VAR_2IN_1OUT(short1, /) -DECLOP_1VAR_2IN_1OUT(short1, %) -DECLOP_1VAR_2IN_1OUT(short1, &) -DECLOP_1VAR_2IN_1OUT(short1, |) -DECLOP_1VAR_2IN_1OUT(short1, ^) -DECLOP_1VAR_2IN_1OUT(short1, <<) -DECLOP_1VAR_2IN_1OUT(short1, >>) - - -DECLOP_1VAR_ASSIGN(short1, +=) -DECLOP_1VAR_ASSIGN(short1, -=) -DECLOP_1VAR_ASSIGN(short1, *=) -DECLOP_1VAR_ASSIGN(short1, /=) -DECLOP_1VAR_ASSIGN(short1, %=) -DECLOP_1VAR_ASSIGN(short1, &=) -DECLOP_1VAR_ASSIGN(short1, |=) -DECLOP_1VAR_ASSIGN(short1, ^=) -DECLOP_1VAR_ASSIGN(short1, <<=) -DECLOP_1VAR_ASSIGN(short1, >>=) - -DECLOP_1VAR_PREOP(short1, ++) -DECLOP_1VAR_PREOP(short1, --) - -DECLOP_1VAR_POSTOP(short1, ++) -DECLOP_1VAR_POSTOP(short1, --) - -DECLOP_1VAR_COMP(short1, ==) -DECLOP_1VAR_COMP(short1, !=) -DECLOP_1VAR_COMP(short1, <) -DECLOP_1VAR_COMP(short1, >) -DECLOP_1VAR_COMP(short1, <=) -DECLOP_1VAR_COMP(short1, >=) - -DECLOP_1VAR_COMP(short1, &&) -DECLOP_1VAR_COMP(short1, ||) - -DECLOP_1VAR_1IN_1OUT(short1, ~) -DECLOP_1VAR_1IN_BOOLOUT(short1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(short1, float) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(short1, double) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) - -// SIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(short2, +) -DECLOP_2VAR_2IN_1OUT(short2, -) -DECLOP_2VAR_2IN_1OUT(short2, *) -DECLOP_2VAR_2IN_1OUT(short2, /) -DECLOP_2VAR_2IN_1OUT(short2, %) -DECLOP_2VAR_2IN_1OUT(short2, &) -DECLOP_2VAR_2IN_1OUT(short2, |) -DECLOP_2VAR_2IN_1OUT(short2, ^) -DECLOP_2VAR_2IN_1OUT(short2, <<) -DECLOP_2VAR_2IN_1OUT(short2, >>) - -DECLOP_2VAR_ASSIGN(short2, +=) -DECLOP_2VAR_ASSIGN(short2, -=) -DECLOP_2VAR_ASSIGN(short2, *=) -DECLOP_2VAR_ASSIGN(short2, /=) -DECLOP_2VAR_ASSIGN(short2, %=) -DECLOP_2VAR_ASSIGN(short2, &=) -DECLOP_2VAR_ASSIGN(short2, |=) -DECLOP_2VAR_ASSIGN(short2, ^=) -DECLOP_2VAR_ASSIGN(short2, <<=) -DECLOP_2VAR_ASSIGN(short2, >>=) - -DECLOP_2VAR_PREOP(short2, ++) -DECLOP_2VAR_PREOP(short2, --) - -DECLOP_2VAR_POSTOP(short2, ++) -DECLOP_2VAR_POSTOP(short2, --) - -DECLOP_2VAR_COMP(short2, ==) -DECLOP_2VAR_COMP(short2, !=) -DECLOP_2VAR_COMP(short2, <) -DECLOP_2VAR_COMP(short2, >) -DECLOP_2VAR_COMP(short2, <=) -DECLOP_2VAR_COMP(short2, >=) - -DECLOP_2VAR_COMP(short2, &&) -DECLOP_2VAR_COMP(short2, ||) - -DECLOP_2VAR_1IN_1OUT(short2, ~) -DECLOP_2VAR_1IN_BOOLOUT(short2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(short2, float) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(short2, double) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) - -// SIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(short3, +) -DECLOP_3VAR_2IN_1OUT(short3, -) -DECLOP_3VAR_2IN_1OUT(short3, *) -DECLOP_3VAR_2IN_1OUT(short3, /) -DECLOP_3VAR_2IN_1OUT(short3, %) -DECLOP_3VAR_2IN_1OUT(short3, &) -DECLOP_3VAR_2IN_1OUT(short3, |) -DECLOP_3VAR_2IN_1OUT(short3, ^) -DECLOP_3VAR_2IN_1OUT(short3, <<) -DECLOP_3VAR_2IN_1OUT(short3, >>) - -DECLOP_3VAR_ASSIGN(short3, +=) -DECLOP_3VAR_ASSIGN(short3, -=) -DECLOP_3VAR_ASSIGN(short3, *=) -DECLOP_3VAR_ASSIGN(short3, /=) -DECLOP_3VAR_ASSIGN(short3, %=) -DECLOP_3VAR_ASSIGN(short3, &=) -DECLOP_3VAR_ASSIGN(short3, |=) -DECLOP_3VAR_ASSIGN(short3, ^=) -DECLOP_3VAR_ASSIGN(short3, <<=) -DECLOP_3VAR_ASSIGN(short3, >>=) - -DECLOP_3VAR_PREOP(short3, ++) -DECLOP_3VAR_PREOP(short3, --) - -DECLOP_3VAR_POSTOP(short3, ++) -DECLOP_3VAR_POSTOP(short3, --) - -DECLOP_3VAR_COMP(short3, ==) -DECLOP_3VAR_COMP(short3, !=) -DECLOP_3VAR_COMP(short3, <) -DECLOP_3VAR_COMP(short3, >) -DECLOP_3VAR_COMP(short3, <=) -DECLOP_3VAR_COMP(short3, >=) - -DECLOP_3VAR_COMP(short3, &&) -DECLOP_3VAR_COMP(short3, ||) - -DECLOP_3VAR_1IN_1OUT(short3, ~) -DECLOP_3VAR_1IN_BOOLOUT(short3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(short3, float) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(short3, double) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) - -// SIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(short4, +) -DECLOP_4VAR_2IN_1OUT(short4, -) -DECLOP_4VAR_2IN_1OUT(short4, *) -DECLOP_4VAR_2IN_1OUT(short4, /) -DECLOP_4VAR_2IN_1OUT(short4, %) -DECLOP_4VAR_2IN_1OUT(short4, &) -DECLOP_4VAR_2IN_1OUT(short4, |) -DECLOP_4VAR_2IN_1OUT(short4, ^) -DECLOP_4VAR_2IN_1OUT(short4, <<) -DECLOP_4VAR_2IN_1OUT(short4, >>) - -DECLOP_4VAR_ASSIGN(short4, +=) -DECLOP_4VAR_ASSIGN(short4, -=) -DECLOP_4VAR_ASSIGN(short4, *=) -DECLOP_4VAR_ASSIGN(short4, /=) -DECLOP_4VAR_ASSIGN(short4, %=) -DECLOP_4VAR_ASSIGN(short4, &=) -DECLOP_4VAR_ASSIGN(short4, |=) -DECLOP_4VAR_ASSIGN(short4, ^=) -DECLOP_4VAR_ASSIGN(short4, <<=) -DECLOP_4VAR_ASSIGN(short4, >>=) - -DECLOP_4VAR_PREOP(short4, ++) -DECLOP_4VAR_PREOP(short4, --) - -DECLOP_4VAR_POSTOP(short4, ++) -DECLOP_4VAR_POSTOP(short4, --) - -DECLOP_4VAR_COMP(short4, ==) -DECLOP_4VAR_COMP(short4, !=) -DECLOP_4VAR_COMP(short4, <) -DECLOP_4VAR_COMP(short4, >) -DECLOP_4VAR_COMP(short4, <=) -DECLOP_4VAR_COMP(short4, >=) - -DECLOP_4VAR_COMP(short4, &&) -DECLOP_4VAR_COMP(short4, ||) - -DECLOP_4VAR_1IN_1OUT(short4, ~) -DECLOP_4VAR_1IN_BOOLOUT(short4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(short4, float) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(short4, double) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) - -// UNSIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(uint1, +) -DECLOP_1VAR_2IN_1OUT(uint1, -) -DECLOP_1VAR_2IN_1OUT(uint1, *) -DECLOP_1VAR_2IN_1OUT(uint1, /) -DECLOP_1VAR_2IN_1OUT(uint1, %) -DECLOP_1VAR_2IN_1OUT(uint1, &) -DECLOP_1VAR_2IN_1OUT(uint1, |) -DECLOP_1VAR_2IN_1OUT(uint1, ^) -DECLOP_1VAR_2IN_1OUT(uint1, <<) -DECLOP_1VAR_2IN_1OUT(uint1, >>) - - -DECLOP_1VAR_ASSIGN(uint1, +=) -DECLOP_1VAR_ASSIGN(uint1, -=) -DECLOP_1VAR_ASSIGN(uint1, *=) -DECLOP_1VAR_ASSIGN(uint1, /=) -DECLOP_1VAR_ASSIGN(uint1, %=) -DECLOP_1VAR_ASSIGN(uint1, &=) -DECLOP_1VAR_ASSIGN(uint1, |=) -DECLOP_1VAR_ASSIGN(uint1, ^=) -DECLOP_1VAR_ASSIGN(uint1, <<=) -DECLOP_1VAR_ASSIGN(uint1, >>=) - -DECLOP_1VAR_PREOP(uint1, ++) -DECLOP_1VAR_PREOP(uint1, --) - -DECLOP_1VAR_POSTOP(uint1, ++) -DECLOP_1VAR_POSTOP(uint1, --) - -DECLOP_1VAR_COMP(uint1, ==) -DECLOP_1VAR_COMP(uint1, !=) -DECLOP_1VAR_COMP(uint1, <) -DECLOP_1VAR_COMP(uint1, >) -DECLOP_1VAR_COMP(uint1, <=) -DECLOP_1VAR_COMP(uint1, >=) - -DECLOP_1VAR_COMP(uint1, &&) -DECLOP_1VAR_COMP(uint1, ||) - -DECLOP_1VAR_1IN_1OUT(uint1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uint1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, float) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, double) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) - -// UNSIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(uint2, +) -DECLOP_2VAR_2IN_1OUT(uint2, -) -DECLOP_2VAR_2IN_1OUT(uint2, *) -DECLOP_2VAR_2IN_1OUT(uint2, /) -DECLOP_2VAR_2IN_1OUT(uint2, %) -DECLOP_2VAR_2IN_1OUT(uint2, &) -DECLOP_2VAR_2IN_1OUT(uint2, |) -DECLOP_2VAR_2IN_1OUT(uint2, ^) -DECLOP_2VAR_2IN_1OUT(uint2, <<) -DECLOP_2VAR_2IN_1OUT(uint2, >>) - -DECLOP_2VAR_ASSIGN(uint2, +=) -DECLOP_2VAR_ASSIGN(uint2, -=) -DECLOP_2VAR_ASSIGN(uint2, *=) -DECLOP_2VAR_ASSIGN(uint2, /=) -DECLOP_2VAR_ASSIGN(uint2, %=) -DECLOP_2VAR_ASSIGN(uint2, &=) -DECLOP_2VAR_ASSIGN(uint2, |=) -DECLOP_2VAR_ASSIGN(uint2, ^=) -DECLOP_2VAR_ASSIGN(uint2, <<=) -DECLOP_2VAR_ASSIGN(uint2, >>=) - -DECLOP_2VAR_PREOP(uint2, ++) -DECLOP_2VAR_PREOP(uint2, --) - -DECLOP_2VAR_POSTOP(uint2, ++) -DECLOP_2VAR_POSTOP(uint2, --) - -DECLOP_2VAR_COMP(uint2, ==) -DECLOP_2VAR_COMP(uint2, !=) -DECLOP_2VAR_COMP(uint2, <) -DECLOP_2VAR_COMP(uint2, >) -DECLOP_2VAR_COMP(uint2, <=) -DECLOP_2VAR_COMP(uint2, >=) - -DECLOP_2VAR_COMP(uint2, &&) -DECLOP_2VAR_COMP(uint2, ||) - -DECLOP_2VAR_1IN_1OUT(uint2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uint2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, float) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, double) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) - -// UNSIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(uint3, +) -DECLOP_3VAR_2IN_1OUT(uint3, -) -DECLOP_3VAR_2IN_1OUT(uint3, *) -DECLOP_3VAR_2IN_1OUT(uint3, /) -DECLOP_3VAR_2IN_1OUT(uint3, %) -DECLOP_3VAR_2IN_1OUT(uint3, &) -DECLOP_3VAR_2IN_1OUT(uint3, |) -DECLOP_3VAR_2IN_1OUT(uint3, ^) -DECLOP_3VAR_2IN_1OUT(uint3, <<) -DECLOP_3VAR_2IN_1OUT(uint3, >>) - -DECLOP_3VAR_ASSIGN(uint3, +=) -DECLOP_3VAR_ASSIGN(uint3, -=) -DECLOP_3VAR_ASSIGN(uint3, *=) -DECLOP_3VAR_ASSIGN(uint3, /=) -DECLOP_3VAR_ASSIGN(uint3, %=) -DECLOP_3VAR_ASSIGN(uint3, &=) -DECLOP_3VAR_ASSIGN(uint3, |=) -DECLOP_3VAR_ASSIGN(uint3, ^=) -DECLOP_3VAR_ASSIGN(uint3, <<=) -DECLOP_3VAR_ASSIGN(uint3, >>=) - -DECLOP_3VAR_PREOP(uint3, ++) -DECLOP_3VAR_PREOP(uint3, --) - -DECLOP_3VAR_POSTOP(uint3, ++) -DECLOP_3VAR_POSTOP(uint3, --) - -DECLOP_3VAR_COMP(uint3, ==) -DECLOP_3VAR_COMP(uint3, !=) -DECLOP_3VAR_COMP(uint3, <) -DECLOP_3VAR_COMP(uint3, >) -DECLOP_3VAR_COMP(uint3, <=) -DECLOP_3VAR_COMP(uint3, >=) - -DECLOP_3VAR_COMP(uint3, &&) -DECLOP_3VAR_COMP(uint3, ||) - -DECLOP_3VAR_1IN_1OUT(uint3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uint3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, float) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, double) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) - -// UNSIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(uint4, +) -DECLOP_4VAR_2IN_1OUT(uint4, -) -DECLOP_4VAR_2IN_1OUT(uint4, *) -DECLOP_4VAR_2IN_1OUT(uint4, /) -DECLOP_4VAR_2IN_1OUT(uint4, %) -DECLOP_4VAR_2IN_1OUT(uint4, &) -DECLOP_4VAR_2IN_1OUT(uint4, |) -DECLOP_4VAR_2IN_1OUT(uint4, ^) -DECLOP_4VAR_2IN_1OUT(uint4, <<) -DECLOP_4VAR_2IN_1OUT(uint4, >>) - -DECLOP_4VAR_ASSIGN(uint4, +=) -DECLOP_4VAR_ASSIGN(uint4, -=) -DECLOP_4VAR_ASSIGN(uint4, *=) -DECLOP_4VAR_ASSIGN(uint4, /=) -DECLOP_4VAR_ASSIGN(uint4, %=) -DECLOP_4VAR_ASSIGN(uint4, &=) -DECLOP_4VAR_ASSIGN(uint4, |=) -DECLOP_4VAR_ASSIGN(uint4, ^=) -DECLOP_4VAR_ASSIGN(uint4, <<=) -DECLOP_4VAR_ASSIGN(uint4, >>=) - -DECLOP_4VAR_PREOP(uint4, ++) -DECLOP_4VAR_PREOP(uint4, --) - -DECLOP_4VAR_POSTOP(uint4, ++) -DECLOP_4VAR_POSTOP(uint4, --) - -DECLOP_4VAR_COMP(uint4, ==) -DECLOP_4VAR_COMP(uint4, !=) -DECLOP_4VAR_COMP(uint4, <) -DECLOP_4VAR_COMP(uint4, >) -DECLOP_4VAR_COMP(uint4, <=) -DECLOP_4VAR_COMP(uint4, >=) - -DECLOP_4VAR_COMP(uint4, &&) -DECLOP_4VAR_COMP(uint4, ||) - -DECLOP_4VAR_1IN_1OUT(uint4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uint4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, float) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, double) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) - -// SIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(int1, +) -DECLOP_1VAR_2IN_1OUT(int1, -) -DECLOP_1VAR_2IN_1OUT(int1, *) -DECLOP_1VAR_2IN_1OUT(int1, /) -DECLOP_1VAR_2IN_1OUT(int1, %) -DECLOP_1VAR_2IN_1OUT(int1, &) -DECLOP_1VAR_2IN_1OUT(int1, |) -DECLOP_1VAR_2IN_1OUT(int1, ^) -DECLOP_1VAR_2IN_1OUT(int1, <<) -DECLOP_1VAR_2IN_1OUT(int1, >>) - - -DECLOP_1VAR_ASSIGN(int1, +=) -DECLOP_1VAR_ASSIGN(int1, -=) -DECLOP_1VAR_ASSIGN(int1, *=) -DECLOP_1VAR_ASSIGN(int1, /=) -DECLOP_1VAR_ASSIGN(int1, %=) -DECLOP_1VAR_ASSIGN(int1, &=) -DECLOP_1VAR_ASSIGN(int1, |=) -DECLOP_1VAR_ASSIGN(int1, ^=) -DECLOP_1VAR_ASSIGN(int1, <<=) -DECLOP_1VAR_ASSIGN(int1, >>=) - -DECLOP_1VAR_PREOP(int1, ++) -DECLOP_1VAR_PREOP(int1, --) - -DECLOP_1VAR_POSTOP(int1, ++) -DECLOP_1VAR_POSTOP(int1, --) - -DECLOP_1VAR_COMP(int1, ==) -DECLOP_1VAR_COMP(int1, !=) -DECLOP_1VAR_COMP(int1, <) -DECLOP_1VAR_COMP(int1, >) -DECLOP_1VAR_COMP(int1, <=) -DECLOP_1VAR_COMP(int1, >=) - -DECLOP_1VAR_COMP(int1, &&) -DECLOP_1VAR_COMP(int1, ||) - -DECLOP_1VAR_1IN_1OUT(int1, ~) -DECLOP_1VAR_1IN_BOOLOUT(int1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(int1, float) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(int1, double) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) - -// SIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(int2, +) -DECLOP_2VAR_2IN_1OUT(int2, -) -DECLOP_2VAR_2IN_1OUT(int2, *) -DECLOP_2VAR_2IN_1OUT(int2, /) -DECLOP_2VAR_2IN_1OUT(int2, %) -DECLOP_2VAR_2IN_1OUT(int2, &) -DECLOP_2VAR_2IN_1OUT(int2, |) -DECLOP_2VAR_2IN_1OUT(int2, ^) -DECLOP_2VAR_2IN_1OUT(int2, <<) -DECLOP_2VAR_2IN_1OUT(int2, >>) - -DECLOP_2VAR_ASSIGN(int2, +=) -DECLOP_2VAR_ASSIGN(int2, -=) -DECLOP_2VAR_ASSIGN(int2, *=) -DECLOP_2VAR_ASSIGN(int2, /=) -DECLOP_2VAR_ASSIGN(int2, %=) -DECLOP_2VAR_ASSIGN(int2, &=) -DECLOP_2VAR_ASSIGN(int2, |=) -DECLOP_2VAR_ASSIGN(int2, ^=) -DECLOP_2VAR_ASSIGN(int2, <<=) -DECLOP_2VAR_ASSIGN(int2, >>=) - -DECLOP_2VAR_PREOP(int2, ++) -DECLOP_2VAR_PREOP(int2, --) - -DECLOP_2VAR_POSTOP(int2, ++) -DECLOP_2VAR_POSTOP(int2, --) - -DECLOP_2VAR_COMP(int2, ==) -DECLOP_2VAR_COMP(int2, !=) -DECLOP_2VAR_COMP(int2, <) -DECLOP_2VAR_COMP(int2, >) -DECLOP_2VAR_COMP(int2, <=) -DECLOP_2VAR_COMP(int2, >=) - -DECLOP_2VAR_COMP(int2, &&) -DECLOP_2VAR_COMP(int2, ||) - -DECLOP_2VAR_1IN_1OUT(int2, ~) -DECLOP_2VAR_1IN_BOOLOUT(int2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(int2, float) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(int2, double) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) - -// SIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(int3, +) -DECLOP_3VAR_2IN_1OUT(int3, -) -DECLOP_3VAR_2IN_1OUT(int3, *) -DECLOP_3VAR_2IN_1OUT(int3, /) -DECLOP_3VAR_2IN_1OUT(int3, %) -DECLOP_3VAR_2IN_1OUT(int3, &) -DECLOP_3VAR_2IN_1OUT(int3, |) -DECLOP_3VAR_2IN_1OUT(int3, ^) -DECLOP_3VAR_2IN_1OUT(int3, <<) -DECLOP_3VAR_2IN_1OUT(int3, >>) - -DECLOP_3VAR_ASSIGN(int3, +=) -DECLOP_3VAR_ASSIGN(int3, -=) -DECLOP_3VAR_ASSIGN(int3, *=) -DECLOP_3VAR_ASSIGN(int3, /=) -DECLOP_3VAR_ASSIGN(int3, %=) -DECLOP_3VAR_ASSIGN(int3, &=) -DECLOP_3VAR_ASSIGN(int3, |=) -DECLOP_3VAR_ASSIGN(int3, ^=) -DECLOP_3VAR_ASSIGN(int3, <<=) -DECLOP_3VAR_ASSIGN(int3, >>=) - -DECLOP_3VAR_PREOP(int3, ++) -DECLOP_3VAR_PREOP(int3, --) - -DECLOP_3VAR_POSTOP(int3, ++) -DECLOP_3VAR_POSTOP(int3, --) - -DECLOP_3VAR_COMP(int3, ==) -DECLOP_3VAR_COMP(int3, !=) -DECLOP_3VAR_COMP(int3, <) -DECLOP_3VAR_COMP(int3, >) -DECLOP_3VAR_COMP(int3, <=) -DECLOP_3VAR_COMP(int3, >=) - -DECLOP_3VAR_COMP(int3, &&) -DECLOP_3VAR_COMP(int3, ||) - -DECLOP_3VAR_1IN_1OUT(int3, ~) -DECLOP_3VAR_1IN_BOOLOUT(int3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(int3, float) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(int3, double) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) - -// SIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(int4, +) -DECLOP_4VAR_2IN_1OUT(int4, -) -DECLOP_4VAR_2IN_1OUT(int4, *) -DECLOP_4VAR_2IN_1OUT(int4, /) -DECLOP_4VAR_2IN_1OUT(int4, %) -DECLOP_4VAR_2IN_1OUT(int4, &) -DECLOP_4VAR_2IN_1OUT(int4, |) -DECLOP_4VAR_2IN_1OUT(int4, ^) -DECLOP_4VAR_2IN_1OUT(int4, <<) -DECLOP_4VAR_2IN_1OUT(int4, >>) - -DECLOP_4VAR_ASSIGN(int4, +=) -DECLOP_4VAR_ASSIGN(int4, -=) -DECLOP_4VAR_ASSIGN(int4, *=) -DECLOP_4VAR_ASSIGN(int4, /=) -DECLOP_4VAR_ASSIGN(int4, %=) -DECLOP_4VAR_ASSIGN(int4, &=) -DECLOP_4VAR_ASSIGN(int4, |=) -DECLOP_4VAR_ASSIGN(int4, ^=) -DECLOP_4VAR_ASSIGN(int4, <<=) -DECLOP_4VAR_ASSIGN(int4, >>=) - -DECLOP_4VAR_PREOP(int4, ++) -DECLOP_4VAR_PREOP(int4, --) - -DECLOP_4VAR_POSTOP(int4, ++) -DECLOP_4VAR_POSTOP(int4, --) - -DECLOP_4VAR_COMP(int4, ==) -DECLOP_4VAR_COMP(int4, !=) -DECLOP_4VAR_COMP(int4, <) -DECLOP_4VAR_COMP(int4, >) -DECLOP_4VAR_COMP(int4, <=) -DECLOP_4VAR_COMP(int4, >=) - -DECLOP_4VAR_COMP(int4, &&) -DECLOP_4VAR_COMP(int4, ||) - -DECLOP_4VAR_1IN_1OUT(int4, ~) -DECLOP_4VAR_1IN_BOOLOUT(int4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(int4, float) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(int4, double) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) - -// FLOAT1 - -DECLOP_1VAR_2IN_1OUT(float1, +) -DECLOP_1VAR_2IN_1OUT(float1, -) -DECLOP_1VAR_2IN_1OUT(float1, *) -DECLOP_1VAR_2IN_1OUT(float1, /) - -DECLOP_1VAR_ASSIGN(float1, +=) -DECLOP_1VAR_ASSIGN(float1, -=) -DECLOP_1VAR_ASSIGN(float1, *=) -DECLOP_1VAR_ASSIGN(float1, /=) - -DECLOP_1VAR_PREOP(float1, ++) -DECLOP_1VAR_PREOP(float1, --) - -DECLOP_1VAR_POSTOP(float1, ++) -DECLOP_1VAR_POSTOP(float1, --) - -DECLOP_1VAR_COMP(float1, ==) -DECLOP_1VAR_COMP(float1, !=) -DECLOP_1VAR_COMP(float1, <) -DECLOP_1VAR_COMP(float1, >) -DECLOP_1VAR_COMP(float1, <=) -DECLOP_1VAR_COMP(float1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(float1, float) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(float1, double) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) - -// FLOAT2 - -DECLOP_2VAR_2IN_1OUT(float2, +) -DECLOP_2VAR_2IN_1OUT(float2, -) -DECLOP_2VAR_2IN_1OUT(float2, *) -DECLOP_2VAR_2IN_1OUT(float2, /) - -DECLOP_2VAR_ASSIGN(float2, +=) -DECLOP_2VAR_ASSIGN(float2, -=) -DECLOP_2VAR_ASSIGN(float2, *=) 
-DECLOP_2VAR_ASSIGN(float2, /=) - -DECLOP_2VAR_PREOP(float2, ++) -DECLOP_2VAR_PREOP(float2, --) - -DECLOP_2VAR_POSTOP(float2, ++) -DECLOP_2VAR_POSTOP(float2, --) - -DECLOP_2VAR_COMP(float2, ==) -DECLOP_2VAR_COMP(float2, !=) -DECLOP_2VAR_COMP(float2, <) -DECLOP_2VAR_COMP(float2, >) -DECLOP_2VAR_COMP(float2, <=) -DECLOP_2VAR_COMP(float2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(float2, float) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(float2, double) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) - -// FLOAT3 - -DECLOP_3VAR_2IN_1OUT(float3, +) -DECLOP_3VAR_2IN_1OUT(float3, -) -DECLOP_3VAR_2IN_1OUT(float3, *) -DECLOP_3VAR_2IN_1OUT(float3, /) - -DECLOP_3VAR_ASSIGN(float3, +=) -DECLOP_3VAR_ASSIGN(float3, -=) -DECLOP_3VAR_ASSIGN(float3, *=) -DECLOP_3VAR_ASSIGN(float3, /=) - -DECLOP_3VAR_PREOP(float3, ++) -DECLOP_3VAR_PREOP(float3, --) - -DECLOP_3VAR_POSTOP(float3, ++) -DECLOP_3VAR_POSTOP(float3, --) - -DECLOP_3VAR_COMP(float3, ==) -DECLOP_3VAR_COMP(float3, !=) -DECLOP_3VAR_COMP(float3, <) -DECLOP_3VAR_COMP(float3, >) -DECLOP_3VAR_COMP(float3, <=) -DECLOP_3VAR_COMP(float3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(float3, float) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(float3, double) 
-DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) - -// FLOAT4 - -DECLOP_4VAR_2IN_1OUT(float4, +) -DECLOP_4VAR_2IN_1OUT(float4, -) -DECLOP_4VAR_2IN_1OUT(float4, *) -DECLOP_4VAR_2IN_1OUT(float4, /) - -DECLOP_4VAR_ASSIGN(float4, +=) -DECLOP_4VAR_ASSIGN(float4, -=) -DECLOP_4VAR_ASSIGN(float4, *=) -DECLOP_4VAR_ASSIGN(float4, /=) - -DECLOP_4VAR_PREOP(float4, ++) -DECLOP_4VAR_PREOP(float4, --) - -DECLOP_4VAR_POSTOP(float4, ++) -DECLOP_4VAR_POSTOP(float4, --) - -DECLOP_4VAR_COMP(float4, ==) -DECLOP_4VAR_COMP(float4, !=) -DECLOP_4VAR_COMP(float4, <) -DECLOP_4VAR_COMP(float4, >) -DECLOP_4VAR_COMP(float4, <=) -DECLOP_4VAR_COMP(float4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(float4, float) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(float4, double) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) - -// DOUBLE1 - -DECLOP_1VAR_2IN_1OUT(double1, +) -DECLOP_1VAR_2IN_1OUT(double1, -) -DECLOP_1VAR_2IN_1OUT(double1, *) -DECLOP_1VAR_2IN_1OUT(double1, /) - -DECLOP_1VAR_ASSIGN(double1, +=) -DECLOP_1VAR_ASSIGN(double1, -=) -DECLOP_1VAR_ASSIGN(double1, *=) -DECLOP_1VAR_ASSIGN(double1, /=) - -DECLOP_1VAR_PREOP(double1, ++) -DECLOP_1VAR_PREOP(double1, --) - -DECLOP_1VAR_POSTOP(double1, ++) -DECLOP_1VAR_POSTOP(double1, --) - -DECLOP_1VAR_COMP(double1, ==) -DECLOP_1VAR_COMP(double1, !=) -DECLOP_1VAR_COMP(double1, <) -DECLOP_1VAR_COMP(double1, >) -DECLOP_1VAR_COMP(double1, <=) -DECLOP_1VAR_COMP(double1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) 
-DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(double1, float) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(double1, double) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) - -// DOUBLE2 - -DECLOP_2VAR_2IN_1OUT(double2, +) -DECLOP_2VAR_2IN_1OUT(double2, -) -DECLOP_2VAR_2IN_1OUT(double2, *) -DECLOP_2VAR_2IN_1OUT(double2, /) - -DECLOP_2VAR_ASSIGN(double2, +=) -DECLOP_2VAR_ASSIGN(double2, -=) -DECLOP_2VAR_ASSIGN(double2, *=) -DECLOP_2VAR_ASSIGN(double2, /=) - -DECLOP_2VAR_PREOP(double2, ++) -DECLOP_2VAR_PREOP(double2, --) - -DECLOP_2VAR_POSTOP(double2, ++) -DECLOP_2VAR_POSTOP(double2, --) - -DECLOP_2VAR_COMP(double2, ==) -DECLOP_2VAR_COMP(double2, !=) -DECLOP_2VAR_COMP(double2, <) -DECLOP_2VAR_COMP(double2, >) -DECLOP_2VAR_COMP(double2, <=) -DECLOP_2VAR_COMP(double2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(double2, float) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(double2, double) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) - -// DOUBLE3 - -DECLOP_3VAR_2IN_1OUT(double3, +) -DECLOP_3VAR_2IN_1OUT(double3, -) -DECLOP_3VAR_2IN_1OUT(double3, *) -DECLOP_3VAR_2IN_1OUT(double3, /) - -DECLOP_3VAR_ASSIGN(double3, +=) -DECLOP_3VAR_ASSIGN(double3, -=) -DECLOP_3VAR_ASSIGN(double3, *=) -DECLOP_3VAR_ASSIGN(double3, /=) - 
-DECLOP_3VAR_PREOP(double3, ++) -DECLOP_3VAR_PREOP(double3, --) - -DECLOP_3VAR_POSTOP(double3, ++) -DECLOP_3VAR_POSTOP(double3, --) - -DECLOP_3VAR_COMP(double3, ==) -DECLOP_3VAR_COMP(double3, !=) -DECLOP_3VAR_COMP(double3, <) -DECLOP_3VAR_COMP(double3, >) -DECLOP_3VAR_COMP(double3, <=) -DECLOP_3VAR_COMP(double3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(double3, float) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(double3, double) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) - -// DOUBLE4 - -DECLOP_4VAR_2IN_1OUT(double4, +) -DECLOP_4VAR_2IN_1OUT(double4, -) -DECLOP_4VAR_2IN_1OUT(double4, *) -DECLOP_4VAR_2IN_1OUT(double4, /) - -DECLOP_4VAR_ASSIGN(double4, +=) -DECLOP_4VAR_ASSIGN(double4, -=) -DECLOP_4VAR_ASSIGN(double4, *=) -DECLOP_4VAR_ASSIGN(double4, /=) - -DECLOP_4VAR_PREOP(double4, ++) -DECLOP_4VAR_PREOP(double4, --) - -DECLOP_4VAR_POSTOP(double4, ++) -DECLOP_4VAR_POSTOP(double4, --) - -DECLOP_4VAR_COMP(double4, ==) -DECLOP_4VAR_COMP(double4, !=) -DECLOP_4VAR_COMP(double4, <) -DECLOP_4VAR_COMP(double4, >) -DECLOP_4VAR_COMP(double4, <=) -DECLOP_4VAR_COMP(double4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(double4, float) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) 
-DECLOP_4VAR_SCALE_PRODUCT(double4, double) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) - -// UNSIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(ulong1, +) -DECLOP_1VAR_2IN_1OUT(ulong1, -) -DECLOP_1VAR_2IN_1OUT(ulong1, *) -DECLOP_1VAR_2IN_1OUT(ulong1, /) -DECLOP_1VAR_2IN_1OUT(ulong1, %) -DECLOP_1VAR_2IN_1OUT(ulong1, &) -DECLOP_1VAR_2IN_1OUT(ulong1, |) -DECLOP_1VAR_2IN_1OUT(ulong1, ^) -DECLOP_1VAR_2IN_1OUT(ulong1, <<) -DECLOP_1VAR_2IN_1OUT(ulong1, >>) - - -DECLOP_1VAR_ASSIGN(ulong1, +=) -DECLOP_1VAR_ASSIGN(ulong1, -=) -DECLOP_1VAR_ASSIGN(ulong1, *=) -DECLOP_1VAR_ASSIGN(ulong1, /=) -DECLOP_1VAR_ASSIGN(ulong1, %=) -DECLOP_1VAR_ASSIGN(ulong1, &=) -DECLOP_1VAR_ASSIGN(ulong1, |=) -DECLOP_1VAR_ASSIGN(ulong1, ^=) -DECLOP_1VAR_ASSIGN(ulong1, <<=) -DECLOP_1VAR_ASSIGN(ulong1, >>=) - -DECLOP_1VAR_PREOP(ulong1, ++) -DECLOP_1VAR_PREOP(ulong1, --) - -DECLOP_1VAR_POSTOP(ulong1, ++) -DECLOP_1VAR_POSTOP(ulong1, --) - -DECLOP_1VAR_COMP(ulong1, ==) -DECLOP_1VAR_COMP(ulong1, !=) -DECLOP_1VAR_COMP(ulong1, <) -DECLOP_1VAR_COMP(ulong1, >) -DECLOP_1VAR_COMP(ulong1, <=) -DECLOP_1VAR_COMP(ulong1, >=) - -DECLOP_1VAR_COMP(ulong1, &&) -DECLOP_1VAR_COMP(ulong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) - -// UNSIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(ulong2, +) -DECLOP_2VAR_2IN_1OUT(ulong2, -) -DECLOP_2VAR_2IN_1OUT(ulong2, *) -DECLOP_2VAR_2IN_1OUT(ulong2, /) -DECLOP_2VAR_2IN_1OUT(ulong2, %) -DECLOP_2VAR_2IN_1OUT(ulong2, &) -DECLOP_2VAR_2IN_1OUT(ulong2, |) -DECLOP_2VAR_2IN_1OUT(ulong2, ^) -DECLOP_2VAR_2IN_1OUT(ulong2, <<) -DECLOP_2VAR_2IN_1OUT(ulong2, >>) - -DECLOP_2VAR_ASSIGN(ulong2, +=) -DECLOP_2VAR_ASSIGN(ulong2, -=) -DECLOP_2VAR_ASSIGN(ulong2, *=) -DECLOP_2VAR_ASSIGN(ulong2, /=) -DECLOP_2VAR_ASSIGN(ulong2, %=) -DECLOP_2VAR_ASSIGN(ulong2, &=) -DECLOP_2VAR_ASSIGN(ulong2, |=) -DECLOP_2VAR_ASSIGN(ulong2, ^=) -DECLOP_2VAR_ASSIGN(ulong2, <<=) -DECLOP_2VAR_ASSIGN(ulong2, >>=) - -DECLOP_2VAR_PREOP(ulong2, ++) -DECLOP_2VAR_PREOP(ulong2, --) - -DECLOP_2VAR_POSTOP(ulong2, ++) -DECLOP_2VAR_POSTOP(ulong2, --) - -DECLOP_2VAR_COMP(ulong2, ==) -DECLOP_2VAR_COMP(ulong2, !=) -DECLOP_2VAR_COMP(ulong2, <) -DECLOP_2VAR_COMP(ulong2, >) -DECLOP_2VAR_COMP(ulong2, <=) -DECLOP_2VAR_COMP(ulong2, >=) - -DECLOP_2VAR_COMP(ulong2, &&) -DECLOP_2VAR_COMP(ulong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) - -// UNSIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(ulong3, +) -DECLOP_3VAR_2IN_1OUT(ulong3, -) -DECLOP_3VAR_2IN_1OUT(ulong3, *) -DECLOP_3VAR_2IN_1OUT(ulong3, /) -DECLOP_3VAR_2IN_1OUT(ulong3, %) -DECLOP_3VAR_2IN_1OUT(ulong3, &) -DECLOP_3VAR_2IN_1OUT(ulong3, |) -DECLOP_3VAR_2IN_1OUT(ulong3, ^) -DECLOP_3VAR_2IN_1OUT(ulong3, <<) -DECLOP_3VAR_2IN_1OUT(ulong3, >>) - -DECLOP_3VAR_ASSIGN(ulong3, +=) -DECLOP_3VAR_ASSIGN(ulong3, -=) -DECLOP_3VAR_ASSIGN(ulong3, *=) -DECLOP_3VAR_ASSIGN(ulong3, /=) -DECLOP_3VAR_ASSIGN(ulong3, %=) -DECLOP_3VAR_ASSIGN(ulong3, &=) -DECLOP_3VAR_ASSIGN(ulong3, |=) -DECLOP_3VAR_ASSIGN(ulong3, ^=) -DECLOP_3VAR_ASSIGN(ulong3, <<=) -DECLOP_3VAR_ASSIGN(ulong3, >>=) - -DECLOP_3VAR_PREOP(ulong3, ++) -DECLOP_3VAR_PREOP(ulong3, --) - -DECLOP_3VAR_POSTOP(ulong3, ++) -DECLOP_3VAR_POSTOP(ulong3, --) - -DECLOP_3VAR_COMP(ulong3, ==) -DECLOP_3VAR_COMP(ulong3, !=) -DECLOP_3VAR_COMP(ulong3, <) -DECLOP_3VAR_COMP(ulong3, >) -DECLOP_3VAR_COMP(ulong3, <=) -DECLOP_3VAR_COMP(ulong3, >=) - -DECLOP_3VAR_COMP(ulong3, &&) -DECLOP_3VAR_COMP(ulong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) - -// UNSIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(ulong4, +) -DECLOP_4VAR_2IN_1OUT(ulong4, -) -DECLOP_4VAR_2IN_1OUT(ulong4, *) -DECLOP_4VAR_2IN_1OUT(ulong4, /) -DECLOP_4VAR_2IN_1OUT(ulong4, %) -DECLOP_4VAR_2IN_1OUT(ulong4, &) -DECLOP_4VAR_2IN_1OUT(ulong4, |) -DECLOP_4VAR_2IN_1OUT(ulong4, ^) -DECLOP_4VAR_2IN_1OUT(ulong4, <<) -DECLOP_4VAR_2IN_1OUT(ulong4, >>) - -DECLOP_4VAR_ASSIGN(ulong4, +=) -DECLOP_4VAR_ASSIGN(ulong4, -=) -DECLOP_4VAR_ASSIGN(ulong4, *=) -DECLOP_4VAR_ASSIGN(ulong4, /=) -DECLOP_4VAR_ASSIGN(ulong4, %=) -DECLOP_4VAR_ASSIGN(ulong4, &=) -DECLOP_4VAR_ASSIGN(ulong4, |=) -DECLOP_4VAR_ASSIGN(ulong4, ^=) -DECLOP_4VAR_ASSIGN(ulong4, <<=) -DECLOP_4VAR_ASSIGN(ulong4, >>=) - -DECLOP_4VAR_PREOP(ulong4, ++) -DECLOP_4VAR_PREOP(ulong4, --) - -DECLOP_4VAR_POSTOP(ulong4, ++) -DECLOP_4VAR_POSTOP(ulong4, --) - -DECLOP_4VAR_COMP(ulong4, ==) -DECLOP_4VAR_COMP(ulong4, !=) -DECLOP_4VAR_COMP(ulong4, <) -DECLOP_4VAR_COMP(ulong4, >) -DECLOP_4VAR_COMP(ulong4, <=) -DECLOP_4VAR_COMP(ulong4, >=) - -DECLOP_4VAR_COMP(ulong4, &&) -DECLOP_4VAR_COMP(ulong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) - -// SIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(long1, +) -DECLOP_1VAR_2IN_1OUT(long1, -) -DECLOP_1VAR_2IN_1OUT(long1, *) -DECLOP_1VAR_2IN_1OUT(long1, /) -DECLOP_1VAR_2IN_1OUT(long1, %) -DECLOP_1VAR_2IN_1OUT(long1, &) -DECLOP_1VAR_2IN_1OUT(long1, |) -DECLOP_1VAR_2IN_1OUT(long1, ^) -DECLOP_1VAR_2IN_1OUT(long1, <<) -DECLOP_1VAR_2IN_1OUT(long1, >>) - - -DECLOP_1VAR_ASSIGN(long1, +=) -DECLOP_1VAR_ASSIGN(long1, -=) -DECLOP_1VAR_ASSIGN(long1, *=) -DECLOP_1VAR_ASSIGN(long1, /=) -DECLOP_1VAR_ASSIGN(long1, %=) -DECLOP_1VAR_ASSIGN(long1, &=) -DECLOP_1VAR_ASSIGN(long1, |=) -DECLOP_1VAR_ASSIGN(long1, ^=) -DECLOP_1VAR_ASSIGN(long1, <<=) -DECLOP_1VAR_ASSIGN(long1, >>=) - -DECLOP_1VAR_PREOP(long1, ++) -DECLOP_1VAR_PREOP(long1, --) - -DECLOP_1VAR_POSTOP(long1, ++) -DECLOP_1VAR_POSTOP(long1, --) - -DECLOP_1VAR_COMP(long1, ==) -DECLOP_1VAR_COMP(long1, !=) -DECLOP_1VAR_COMP(long1, <) -DECLOP_1VAR_COMP(long1, >) -DECLOP_1VAR_COMP(long1, <=) -DECLOP_1VAR_COMP(long1, >=) - -DECLOP_1VAR_COMP(long1, &&) -DECLOP_1VAR_COMP(long1, ||) - -DECLOP_1VAR_1IN_1OUT(long1, ~) -DECLOP_1VAR_1IN_BOOLOUT(long1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(long1, float) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(long1, double) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) - -// SIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(long2, +) -DECLOP_2VAR_2IN_1OUT(long2, -) -DECLOP_2VAR_2IN_1OUT(long2, *) -DECLOP_2VAR_2IN_1OUT(long2, /) -DECLOP_2VAR_2IN_1OUT(long2, %) -DECLOP_2VAR_2IN_1OUT(long2, &) -DECLOP_2VAR_2IN_1OUT(long2, |) -DECLOP_2VAR_2IN_1OUT(long2, ^) -DECLOP_2VAR_2IN_1OUT(long2, <<) -DECLOP_2VAR_2IN_1OUT(long2, >>) - -DECLOP_2VAR_ASSIGN(long2, +=) -DECLOP_2VAR_ASSIGN(long2, -=) -DECLOP_2VAR_ASSIGN(long2, *=) -DECLOP_2VAR_ASSIGN(long2, /=) -DECLOP_2VAR_ASSIGN(long2, %=) -DECLOP_2VAR_ASSIGN(long2, &=) -DECLOP_2VAR_ASSIGN(long2, |=) -DECLOP_2VAR_ASSIGN(long2, ^=) -DECLOP_2VAR_ASSIGN(long2, <<=) -DECLOP_2VAR_ASSIGN(long2, >>=) - -DECLOP_2VAR_PREOP(long2, ++) -DECLOP_2VAR_PREOP(long2, --) - -DECLOP_2VAR_POSTOP(long2, ++) -DECLOP_2VAR_POSTOP(long2, --) - -DECLOP_2VAR_COMP(long2, ==) -DECLOP_2VAR_COMP(long2, !=) -DECLOP_2VAR_COMP(long2, <) -DECLOP_2VAR_COMP(long2, >) -DECLOP_2VAR_COMP(long2, <=) -DECLOP_2VAR_COMP(long2, >=) - -DECLOP_2VAR_COMP(long2, &&) -DECLOP_2VAR_COMP(long2, ||) - -DECLOP_2VAR_1IN_1OUT(long2, ~) -DECLOP_2VAR_1IN_BOOLOUT(long2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(long2, float) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(long2, double) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) - -// SIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(long3, +) -DECLOP_3VAR_2IN_1OUT(long3, -) -DECLOP_3VAR_2IN_1OUT(long3, *) -DECLOP_3VAR_2IN_1OUT(long3, /) -DECLOP_3VAR_2IN_1OUT(long3, %) -DECLOP_3VAR_2IN_1OUT(long3, &) -DECLOP_3VAR_2IN_1OUT(long3, |) -DECLOP_3VAR_2IN_1OUT(long3, ^) -DECLOP_3VAR_2IN_1OUT(long3, <<) -DECLOP_3VAR_2IN_1OUT(long3, >>) - -DECLOP_3VAR_ASSIGN(long3, +=) -DECLOP_3VAR_ASSIGN(long3, -=) -DECLOP_3VAR_ASSIGN(long3, *=) -DECLOP_3VAR_ASSIGN(long3, /=) -DECLOP_3VAR_ASSIGN(long3, %=) -DECLOP_3VAR_ASSIGN(long3, &=) -DECLOP_3VAR_ASSIGN(long3, |=) -DECLOP_3VAR_ASSIGN(long3, ^=) -DECLOP_3VAR_ASSIGN(long3, <<=) -DECLOP_3VAR_ASSIGN(long3, >>=) - -DECLOP_3VAR_PREOP(long3, ++) -DECLOP_3VAR_PREOP(long3, --) - -DECLOP_3VAR_POSTOP(long3, ++) -DECLOP_3VAR_POSTOP(long3, --) - -DECLOP_3VAR_COMP(long3, ==) -DECLOP_3VAR_COMP(long3, !=) -DECLOP_3VAR_COMP(long3, <) -DECLOP_3VAR_COMP(long3, >) -DECLOP_3VAR_COMP(long3, <=) -DECLOP_3VAR_COMP(long3, >=) - -DECLOP_3VAR_COMP(long3, &&) -DECLOP_3VAR_COMP(long3, ||) - -DECLOP_3VAR_1IN_1OUT(long3, ~) -DECLOP_3VAR_1IN_BOOLOUT(long3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(long3, float) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(long3, double) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) - -// SIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(long4, +) -DECLOP_4VAR_2IN_1OUT(long4, -) -DECLOP_4VAR_2IN_1OUT(long4, *) -DECLOP_4VAR_2IN_1OUT(long4, /) -DECLOP_4VAR_2IN_1OUT(long4, %) -DECLOP_4VAR_2IN_1OUT(long4, &) -DECLOP_4VAR_2IN_1OUT(long4, |) -DECLOP_4VAR_2IN_1OUT(long4, ^) -DECLOP_4VAR_2IN_1OUT(long4, <<) -DECLOP_4VAR_2IN_1OUT(long4, >>) - -DECLOP_4VAR_ASSIGN(long4, +=) -DECLOP_4VAR_ASSIGN(long4, -=) -DECLOP_4VAR_ASSIGN(long4, *=) -DECLOP_4VAR_ASSIGN(long4, /=) -DECLOP_4VAR_ASSIGN(long4, %=) -DECLOP_4VAR_ASSIGN(long4, &=) -DECLOP_4VAR_ASSIGN(long4, |=) -DECLOP_4VAR_ASSIGN(long4, ^=) -DECLOP_4VAR_ASSIGN(long4, <<=) -DECLOP_4VAR_ASSIGN(long4, >>=) - -DECLOP_4VAR_PREOP(long4, ++) -DECLOP_4VAR_PREOP(long4, --) - -DECLOP_4VAR_POSTOP(long4, ++) -DECLOP_4VAR_POSTOP(long4, --) - -DECLOP_4VAR_COMP(long4, ==) -DECLOP_4VAR_COMP(long4, !=) -DECLOP_4VAR_COMP(long4, <) -DECLOP_4VAR_COMP(long4, >) -DECLOP_4VAR_COMP(long4, <=) -DECLOP_4VAR_COMP(long4, >=) - -DECLOP_4VAR_COMP(long4, &&) -DECLOP_4VAR_COMP(long4, ||) - -DECLOP_4VAR_1IN_1OUT(long4, ~) -DECLOP_4VAR_1IN_BOOLOUT(long4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(long4, float) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(long4, double) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) - -// UNSIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(ulonglong1, +) -DECLOP_1VAR_2IN_1OUT(ulonglong1, -) -DECLOP_1VAR_2IN_1OUT(ulonglong1, *) -DECLOP_1VAR_2IN_1OUT(ulonglong1, /) -DECLOP_1VAR_2IN_1OUT(ulonglong1, %) -DECLOP_1VAR_2IN_1OUT(ulonglong1, &) -DECLOP_1VAR_2IN_1OUT(ulonglong1, |) -DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) -DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) -DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) - - -DECLOP_1VAR_ASSIGN(ulonglong1, +=) -DECLOP_1VAR_ASSIGN(ulonglong1, -=) -DECLOP_1VAR_ASSIGN(ulonglong1, *=) -DECLOP_1VAR_ASSIGN(ulonglong1, /=) -DECLOP_1VAR_ASSIGN(ulonglong1, %=) -DECLOP_1VAR_ASSIGN(ulonglong1, &=) -DECLOP_1VAR_ASSIGN(ulonglong1, |=) -DECLOP_1VAR_ASSIGN(ulonglong1, ^=) -DECLOP_1VAR_ASSIGN(ulonglong1, <<=) -DECLOP_1VAR_ASSIGN(ulonglong1, >>=) - -DECLOP_1VAR_PREOP(ulonglong1, ++) -DECLOP_1VAR_PREOP(ulonglong1, --) - -DECLOP_1VAR_POSTOP(ulonglong1, ++) -DECLOP_1VAR_POSTOP(ulonglong1, --) - -DECLOP_1VAR_COMP(ulonglong1, ==) -DECLOP_1VAR_COMP(ulonglong1, !=) -DECLOP_1VAR_COMP(ulonglong1, <) -DECLOP_1VAR_COMP(ulonglong1, >) -DECLOP_1VAR_COMP(ulonglong1, <=) -DECLOP_1VAR_COMP(ulonglong1, >=) - -DECLOP_1VAR_COMP(ulonglong1, &&) -DECLOP_1VAR_COMP(ulonglong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) - -// UNSIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(ulonglong2, +) -DECLOP_2VAR_2IN_1OUT(ulonglong2, -) -DECLOP_2VAR_2IN_1OUT(ulonglong2, *) -DECLOP_2VAR_2IN_1OUT(ulonglong2, /) -DECLOP_2VAR_2IN_1OUT(ulonglong2, %) -DECLOP_2VAR_2IN_1OUT(ulonglong2, &) -DECLOP_2VAR_2IN_1OUT(ulonglong2, |) -DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) -DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) -DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) - -DECLOP_2VAR_ASSIGN(ulonglong2, +=) -DECLOP_2VAR_ASSIGN(ulonglong2, -=) -DECLOP_2VAR_ASSIGN(ulonglong2, *=) -DECLOP_2VAR_ASSIGN(ulonglong2, /=) -DECLOP_2VAR_ASSIGN(ulonglong2, %=) -DECLOP_2VAR_ASSIGN(ulonglong2, &=) -DECLOP_2VAR_ASSIGN(ulonglong2, |=) -DECLOP_2VAR_ASSIGN(ulonglong2, ^=) -DECLOP_2VAR_ASSIGN(ulonglong2, <<=) -DECLOP_2VAR_ASSIGN(ulonglong2, >>=) - -DECLOP_2VAR_PREOP(ulonglong2, ++) -DECLOP_2VAR_PREOP(ulonglong2, --) - -DECLOP_2VAR_POSTOP(ulonglong2, ++) -DECLOP_2VAR_POSTOP(ulonglong2, --) - -DECLOP_2VAR_COMP(ulonglong2, ==) -DECLOP_2VAR_COMP(ulonglong2, !=) -DECLOP_2VAR_COMP(ulonglong2, <) -DECLOP_2VAR_COMP(ulonglong2, >) -DECLOP_2VAR_COMP(ulonglong2, <=) -DECLOP_2VAR_COMP(ulonglong2, >=) - -DECLOP_2VAR_COMP(ulonglong2, &&) -DECLOP_2VAR_COMP(ulonglong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) - -// UNSIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(ulonglong3, +) -DECLOP_3VAR_2IN_1OUT(ulonglong3, -) -DECLOP_3VAR_2IN_1OUT(ulonglong3, *) -DECLOP_3VAR_2IN_1OUT(ulonglong3, /) -DECLOP_3VAR_2IN_1OUT(ulonglong3, %) -DECLOP_3VAR_2IN_1OUT(ulonglong3, &) -DECLOP_3VAR_2IN_1OUT(ulonglong3, |) -DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) -DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) -DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) - -DECLOP_3VAR_ASSIGN(ulonglong3, +=) -DECLOP_3VAR_ASSIGN(ulonglong3, -=) -DECLOP_3VAR_ASSIGN(ulonglong3, *=) -DECLOP_3VAR_ASSIGN(ulonglong3, /=) -DECLOP_3VAR_ASSIGN(ulonglong3, %=) -DECLOP_3VAR_ASSIGN(ulonglong3, &=) -DECLOP_3VAR_ASSIGN(ulonglong3, |=) -DECLOP_3VAR_ASSIGN(ulonglong3, ^=) -DECLOP_3VAR_ASSIGN(ulonglong3, <<=) -DECLOP_3VAR_ASSIGN(ulonglong3, >>=) - -DECLOP_3VAR_PREOP(ulonglong3, ++) -DECLOP_3VAR_PREOP(ulonglong3, --) - -DECLOP_3VAR_POSTOP(ulonglong3, ++) -DECLOP_3VAR_POSTOP(ulonglong3, --) - -DECLOP_3VAR_COMP(ulonglong3, ==) -DECLOP_3VAR_COMP(ulonglong3, !=) -DECLOP_3VAR_COMP(ulonglong3, <) -DECLOP_3VAR_COMP(ulonglong3, >) -DECLOP_3VAR_COMP(ulonglong3, <=) -DECLOP_3VAR_COMP(ulonglong3, >=) - -DECLOP_3VAR_COMP(ulonglong3, &&) -DECLOP_3VAR_COMP(ulonglong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) - -// UNSIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(ulonglong4, +) -DECLOP_4VAR_2IN_1OUT(ulonglong4, -) -DECLOP_4VAR_2IN_1OUT(ulonglong4, *) -DECLOP_4VAR_2IN_1OUT(ulonglong4, /) -DECLOP_4VAR_2IN_1OUT(ulonglong4, %) -DECLOP_4VAR_2IN_1OUT(ulonglong4, &) -DECLOP_4VAR_2IN_1OUT(ulonglong4, |) -DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) -DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) -DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) - -DECLOP_4VAR_ASSIGN(ulonglong4, +=) -DECLOP_4VAR_ASSIGN(ulonglong4, -=) -DECLOP_4VAR_ASSIGN(ulonglong4, *=) -DECLOP_4VAR_ASSIGN(ulonglong4, /=) -DECLOP_4VAR_ASSIGN(ulonglong4, %=) -DECLOP_4VAR_ASSIGN(ulonglong4, &=) -DECLOP_4VAR_ASSIGN(ulonglong4, |=) -DECLOP_4VAR_ASSIGN(ulonglong4, ^=) -DECLOP_4VAR_ASSIGN(ulonglong4, <<=) -DECLOP_4VAR_ASSIGN(ulonglong4, >>=) - -DECLOP_4VAR_PREOP(ulonglong4, ++) -DECLOP_4VAR_PREOP(ulonglong4, --) - -DECLOP_4VAR_POSTOP(ulonglong4, ++) -DECLOP_4VAR_POSTOP(ulonglong4, --) - -DECLOP_4VAR_COMP(ulonglong4, ==) -DECLOP_4VAR_COMP(ulonglong4, !=) -DECLOP_4VAR_COMP(ulonglong4, <) -DECLOP_4VAR_COMP(ulonglong4, >) -DECLOP_4VAR_COMP(ulonglong4, <=) -DECLOP_4VAR_COMP(ulonglong4, >=) - -DECLOP_4VAR_COMP(ulonglong4, &&) -DECLOP_4VAR_COMP(ulonglong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) - -// SIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(longlong1, +) -DECLOP_1VAR_2IN_1OUT(longlong1, -) -DECLOP_1VAR_2IN_1OUT(longlong1, *) -DECLOP_1VAR_2IN_1OUT(longlong1, /) -DECLOP_1VAR_2IN_1OUT(longlong1, %) -DECLOP_1VAR_2IN_1OUT(longlong1, &) -DECLOP_1VAR_2IN_1OUT(longlong1, |) -DECLOP_1VAR_2IN_1OUT(longlong1, ^) -DECLOP_1VAR_2IN_1OUT(longlong1, <<) -DECLOP_1VAR_2IN_1OUT(longlong1, >>) - - -DECLOP_1VAR_ASSIGN(longlong1, +=) -DECLOP_1VAR_ASSIGN(longlong1, -=) -DECLOP_1VAR_ASSIGN(longlong1, *=) -DECLOP_1VAR_ASSIGN(longlong1, /=) -DECLOP_1VAR_ASSIGN(longlong1, %=) -DECLOP_1VAR_ASSIGN(longlong1, &=) -DECLOP_1VAR_ASSIGN(longlong1, |=) -DECLOP_1VAR_ASSIGN(longlong1, ^=) -DECLOP_1VAR_ASSIGN(longlong1, <<=) -DECLOP_1VAR_ASSIGN(longlong1, >>=) - -DECLOP_1VAR_PREOP(longlong1, ++) -DECLOP_1VAR_PREOP(longlong1, --) - -DECLOP_1VAR_POSTOP(longlong1, ++) -DECLOP_1VAR_POSTOP(longlong1, --) - -DECLOP_1VAR_COMP(longlong1, ==) -DECLOP_1VAR_COMP(longlong1, !=) -DECLOP_1VAR_COMP(longlong1, <) -DECLOP_1VAR_COMP(longlong1, >) -DECLOP_1VAR_COMP(longlong1, <=) -DECLOP_1VAR_COMP(longlong1, >=) - -DECLOP_1VAR_COMP(longlong1, &&) -DECLOP_1VAR_COMP(longlong1, ||) - -DECLOP_1VAR_1IN_1OUT(longlong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) - -// SIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(longlong2, +) -DECLOP_2VAR_2IN_1OUT(longlong2, -) -DECLOP_2VAR_2IN_1OUT(longlong2, *) -DECLOP_2VAR_2IN_1OUT(longlong2, /) -DECLOP_2VAR_2IN_1OUT(longlong2, %) -DECLOP_2VAR_2IN_1OUT(longlong2, &) -DECLOP_2VAR_2IN_1OUT(longlong2, |) -DECLOP_2VAR_2IN_1OUT(longlong2, ^) -DECLOP_2VAR_2IN_1OUT(longlong2, <<) -DECLOP_2VAR_2IN_1OUT(longlong2, >>) - -DECLOP_2VAR_ASSIGN(longlong2, +=) -DECLOP_2VAR_ASSIGN(longlong2, -=) -DECLOP_2VAR_ASSIGN(longlong2, *=) -DECLOP_2VAR_ASSIGN(longlong2, /=) -DECLOP_2VAR_ASSIGN(longlong2, %=) -DECLOP_2VAR_ASSIGN(longlong2, &=) -DECLOP_2VAR_ASSIGN(longlong2, |=) -DECLOP_2VAR_ASSIGN(longlong2, ^=) -DECLOP_2VAR_ASSIGN(longlong2, <<=) -DECLOP_2VAR_ASSIGN(longlong2, >>=) - -DECLOP_2VAR_PREOP(longlong2, ++) -DECLOP_2VAR_PREOP(longlong2, --) - -DECLOP_2VAR_POSTOP(longlong2, ++) -DECLOP_2VAR_POSTOP(longlong2, --) - -DECLOP_2VAR_COMP(longlong2, ==) -DECLOP_2VAR_COMP(longlong2, !=) -DECLOP_2VAR_COMP(longlong2, <) -DECLOP_2VAR_COMP(longlong2, >) -DECLOP_2VAR_COMP(longlong2, <=) -DECLOP_2VAR_COMP(longlong2, >=) - -DECLOP_2VAR_COMP(longlong2, &&) -DECLOP_2VAR_COMP(longlong2, ||) - -DECLOP_2VAR_1IN_1OUT(longlong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) - -// SIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(longlong3, +) -DECLOP_3VAR_2IN_1OUT(longlong3, -) -DECLOP_3VAR_2IN_1OUT(longlong3, *) -DECLOP_3VAR_2IN_1OUT(longlong3, /) -DECLOP_3VAR_2IN_1OUT(longlong3, %) -DECLOP_3VAR_2IN_1OUT(longlong3, &) -DECLOP_3VAR_2IN_1OUT(longlong3, |) -DECLOP_3VAR_2IN_1OUT(longlong3, ^) -DECLOP_3VAR_2IN_1OUT(longlong3, <<) -DECLOP_3VAR_2IN_1OUT(longlong3, >>) - -DECLOP_3VAR_ASSIGN(longlong3, +=) -DECLOP_3VAR_ASSIGN(longlong3, -=) -DECLOP_3VAR_ASSIGN(longlong3, *=) -DECLOP_3VAR_ASSIGN(longlong3, /=) -DECLOP_3VAR_ASSIGN(longlong3, %=) -DECLOP_3VAR_ASSIGN(longlong3, &=) -DECLOP_3VAR_ASSIGN(longlong3, |=) -DECLOP_3VAR_ASSIGN(longlong3, ^=) -DECLOP_3VAR_ASSIGN(longlong3, <<=) -DECLOP_3VAR_ASSIGN(longlong3, >>=) - -DECLOP_3VAR_PREOP(longlong3, ++) -DECLOP_3VAR_PREOP(longlong3, --) - -DECLOP_3VAR_POSTOP(longlong3, ++) -DECLOP_3VAR_POSTOP(longlong3, --) - -DECLOP_3VAR_COMP(longlong3, ==) -DECLOP_3VAR_COMP(longlong3, !=) -DECLOP_3VAR_COMP(longlong3, <) -DECLOP_3VAR_COMP(longlong3, >) -DECLOP_3VAR_COMP(longlong3, <=) -DECLOP_3VAR_COMP(longlong3, >=) - -DECLOP_3VAR_COMP(longlong3, &&) -DECLOP_3VAR_COMP(longlong3, ||) - -DECLOP_3VAR_1IN_1OUT(longlong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) - -// SIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(longlong4, +) -DECLOP_4VAR_2IN_1OUT(longlong4, -) -DECLOP_4VAR_2IN_1OUT(longlong4, *) -DECLOP_4VAR_2IN_1OUT(longlong4, /) -DECLOP_4VAR_2IN_1OUT(longlong4, %) -DECLOP_4VAR_2IN_1OUT(longlong4, &) -DECLOP_4VAR_2IN_1OUT(longlong4, |) -DECLOP_4VAR_2IN_1OUT(longlong4, ^) -DECLOP_4VAR_2IN_1OUT(longlong4, <<) -DECLOP_4VAR_2IN_1OUT(longlong4, >>) - -DECLOP_4VAR_ASSIGN(longlong4, +=) -DECLOP_4VAR_ASSIGN(longlong4, -=) -DECLOP_4VAR_ASSIGN(longlong4, *=) -DECLOP_4VAR_ASSIGN(longlong4, /=) -DECLOP_4VAR_ASSIGN(longlong4, %=) -DECLOP_4VAR_ASSIGN(longlong4, &=) -DECLOP_4VAR_ASSIGN(longlong4, |=) -DECLOP_4VAR_ASSIGN(longlong4, ^=) -DECLOP_4VAR_ASSIGN(longlong4, <<=) -DECLOP_4VAR_ASSIGN(longlong4, >>=) - -DECLOP_4VAR_PREOP(longlong4, ++) -DECLOP_4VAR_PREOP(longlong4, --) - -DECLOP_4VAR_POSTOP(longlong4, ++) -DECLOP_4VAR_POSTOP(longlong4, --) - -DECLOP_4VAR_COMP(longlong4, ==) -DECLOP_4VAR_COMP(longlong4, !=) -DECLOP_4VAR_COMP(longlong4, <) -DECLOP_4VAR_COMP(longlong4, >) -DECLOP_4VAR_COMP(longlong4, <=) -DECLOP_4VAR_COMP(longlong4, >=) - -DECLOP_4VAR_COMP(longlong4, &&) -DECLOP_4VAR_COMP(longlong4, ||) - -DECLOP_4VAR_1IN_1OUT(longlong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif + #endif diff --git a/src/hip_fp16.cpp b/src/hip_fp16.cpp index c2b7b47597..8e8f003f56 100644 --- a/src/hip_fp16.cpp +++ b/src/hip_fp16.cpp @@ -90,11 +90,11 @@ __device__ bool __hgt(__half a, __half b) { } __device__ bool __hisinf(__half a) { - return a == __hInfValue.h ? true : false; + return a == HINF ? true : false; } __device__ bool __hisnan(__half a) { - return a > __hInfValue.h ? true : false; + return a > HINF ? true : false; } __device__ bool __hle(__half a, __half b) { @@ -114,75 +114,75 @@ Half2 Comparision Functions */ __device__ bool __hbeq2(__half2 a, __half2 b) { - return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false); + return (a.x == b.x ? true : false) && (a.y == b.y ? true : false); } __device__ bool __hbge2(__half2 a, __half2 b) { - return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false); + return (a.x >= b.x ? true : false) && (a.y >= b.y ? true : false); } __device__ bool __hbgt2(__half2 a, __half2 b) { - return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false); + return (a.x > b.x ? true : false) && (a.y > b.y ? true : false); } __device__ bool __hble2(__half2 a, __half2 b) { - return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false); + return (a.x <= b.x ? true : false) && (a.y <= b.y ? 
true : false); } __device__ bool __hblt2(__half2 a, __half2 b) { - return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false); + return (a.x < b.x ? true : false) && (a.y < b.y ? true : false); } __device__ bool __hbne2(__half2 a, __half2 b) { - return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false); + return (a.x != b.x ? true : false) && (a.y != b.y ? true : false); } __device__ __half2 __heq2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] == b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] == b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x == b.x) ? (__half)1 : (__half)0; + c.y = (a.y == b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hge2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] >= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] >= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x >= b.x) ? (__half)1 : (__half)0; + c.y = (a.y >= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hgt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] > b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x > b.x) ? (__half)1 : (__half)0; + c.y = (a.y > b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hisnan2(__half2 a) { __half2 c; - c.p[0] = (a.p[0] > __hInfValue.h) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > __hInfValue.h) ? (__half)1 : (__half)0; + c.x = (a.x > HINF) ? (__half)1 : (__half)0; + c.y = (a.y > HINF) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hle2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] <= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] <= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x <= b.x) ? (__half)1 : (__half)0; + c.y = (a.y <= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hlt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] < b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] < b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x < b.x) ? 
(__half)1 : (__half)0; + c.y = (a.y < b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hne2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] != b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] != b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x != b.x) ? (__half)1 : (__half)0; + c.y = (a.y != b.y) ? (__half)1 : (__half)0; return c; } @@ -191,8 +191,8 @@ Conversion instructions */ __device__ __half2 __float22half2_rn(const float2 a) { __half2 b; - b.p[0] = (__half)a.x; - b.p[1] = (__half)a.y; + b.x = (__half)a.x; + b.y = (__half)a.y; return b; } @@ -202,8 +202,8 @@ __device__ __half __float2half(const float a) { __device__ __half2 __float2half2_rn(const float a) { __half2 b; - b.p[0] = (__half)a; - b.p[1] = (__half)a; + b.x = (__half)a; + b.y = (__half)a; return b; } @@ -225,15 +225,15 @@ __device__ __half __float2half_rz(const float a) { __device__ __half2 __floats2half2_rn(const float a, const float b) { __half2 c; - c.p[0] = (__half)a; - c.p[1] = (__half)b; + c.x = (__half)a; + c.y = (__half)b; return c; } __device__ float2 __half22float2(const __half2 a) { float2 b; - b.x = (float)a.p[0]; - b.y = (float)a.p[1]; + b.x = (float)a.x; + b.y = (float)a.y; return b; } @@ -243,8 +243,8 @@ __device__ float __half2float(const __half a) { __device__ __half2 half2half2(const __half a) { __half2 b; - b.p[0] = a; - b.p[1] = a; + b.x = a; + b.y = a; return b; } @@ -358,30 +358,30 @@ __device__ unsigned short int __half_as_ushort(const __half h) { __device__ __half2 __halves2half2(const __half a, const __half b) { __half2 c; - c.p[0] = a; - c.p[1] = b; + c.x = a; + c.y = b; return c; } __device__ float __high2float(const __half2 a) { - return (float)a.p[1]; + return (float)a.y; } __device__ __half __high2half(const __half2 a) { - return a.p[1]; + return a.y; } __device__ __half2 __high2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[1]; + b.x = a.y; + b.y = a.y; return b; } __device__ __half2 __highs2half2(const __half2 a, const __half2 
b) { __half2 c; - c.p[0] = a.p[1]; - c.p[1] = b.p[1]; + c.x = a.y; + c.y = b.y; return c; } @@ -418,38 +418,38 @@ __device__ __half __ll2half_rz(long long int i){ } __device__ float __low2float(const __half2 a) { - return (float)a.p[0]; + return (float)a.x; } __device__ __half __low2half(const __half2 a) { - return a.p[0]; + return a.x; } __device__ __half2 __low2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.x = a.x; + c.y = b.x; return c; } __device__ __half2 __low2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[0]; - b.p[1] = a.p[0]; + b.x = a.x; + b.y = a.x; return b; } __device__ __half2 __lowhigh2highlow(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[0]; + b.x = a.y; + b.y = a.x; return b; } __device__ __half2 __lows2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.x = a.x; + c.y = b.x; return c; } @@ -542,346 +542,4 @@ typedef struct{ }; } struct_float; -#if __clang_major__ == 3 -static __device__ float cvt_half_to_float(__half a){ - struct_float ret = {0}; - if(a.x == 0){ - return 0.0f; - } - if(a.x == 0x8000){ - return -0.0f; - } - ret.u = ((a.x&0x8000)<<16) | (((a.x&0x7c00)+0x1C000)<<13) | ((a.x&0x03FF)<<13); - return ret.f; -} - -static __device__ __half cvt_float_to_half(float b){ - struct_float f = {0}; - __half ret = {0}; - f.f = b; - if(f.f == 0.0f){ - ret.x = 0; - return ret; - } - if(f.f == -0.0f){ - ret.x = 0x8000; - return ret; - } - ret.x = ((f.u>>16)&0x8000)|((((f.u&0x7f800000)-0x38000000)>>13)&0x7c00)|((f.u>>13)&0x03ff); - return ret; -} - - -__device__ __half __soft_hadd(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b)); -} - -__device__ __half __soft_hadd_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) + cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ?
__half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hfma(const __half a, const __half b, const __half c){ - return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c))); -} - -__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){ - float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hmul(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b)); -} - -__device__ __half __soft_hmul_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) * cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hneq(const __half a){ - __half ret = {a.x}; - ret.x ^= 1 << 15; - return ret; -} - -__device__ __half __soft_hsub(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b)); -} - -__device__ __half __soft_hsub_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) - cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? 
__half_value_one_float: cvt_float_to_half(f))); -} - - -/* -Half2 Arithmetic Instructions -*/ - -__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hneq2(const __half2 a){ - __half2 ret; - ret.p[1] = __soft_hneq(a.p[1]); - ret.p[0] = __soft_hneq(a.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]); - return ret; -} - -/* -Half Cmps -*/ - -__device__ bool __soft_heq(const __half a, const __half b){ - return (a.x == b.x ? 
true:false); -} - -__device__ bool __soft_hge(const __half a, const __half b){ - return (cvt_half_to_float(a) >= cvt_half_to_float(b)); -} - -__device__ bool __soft_hgt(const __half a, const __half b){ - return (cvt_half_to_float(a) > cvt_half_to_float(b)); -} - -__device__ bool __soft_hisinf(const __half a){ - return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0); -} - -__device__ bool __soft_hisnan(const __half a){ - if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){ - return true; - }else{ - return false; - } -} - -__device__ bool __soft_hle(const __half a, const __half b){ - return (cvt_half_to_float(a) <= cvt_half_to_float(b)); -} - -__device__ bool __soft_hlt(const __half a, const __half b){ - return (cvt_half_to_float(a) < cvt_half_to_float(b)); -} - -__device__ bool __soft_hne(const __half a, const __half b){ - return a.x == b.x ? false : true; -} - -/* -Half2 Cmps -*/ - -__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){ - return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbge2(const __half2 a, const __half2 b){ - return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){ - return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hble2(const __half2 a, const __half2 b){ - return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]); -} - -__device__ bool __soft_hblt2(const __half2 a, const __half2 b){ - return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbne2(const __half2 a, const __half2 b){ - return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]); -} - - - -__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? 
__half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hisnan2(const __half2 a){ - __half2 ret = {0}; - ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? 
__half_value_one_float : __half_value_zero_float; - return ret; -} - -/* -Half Cnvs and Data Mvmnt -*/ - -__device__ __half2 __soft_float22half2_rn(const float2 a){ - __half2 ret = {0}; - ret.p[1] = cvt_float_to_half(a.x); - ret.p[0] = cvt_float_to_half(a.y); - return ret; -} - -__device__ __half __soft_float2half(const float a){ - return cvt_float_to_half(a); -} - -__device__ __half2 __soft_float2half2_rn(const float a){ - __half ret = cvt_float_to_half(a); - return {ret, ret}; -} - -__device__ __half2 __soft_floats2half2_rn(const float a, const float b){ - return {cvt_float_to_half(a), cvt_float_to_half(b)}; -} - -__device__ float2 __soft_half22float2(const __half2 a){ - return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])}; -} - -__device__ float __soft_half2float(const __half a){ - return cvt_half_to_float(a); -} - -__device__ __half2 __soft_half2half2(const __half a){ - return {a,a}; -} - -__device__ __half2 __soft_halves2half2(const __half a, const __half b){ - return {a,b}; -} - -__device__ float __soft_high2float(const __half2 a){ - return cvt_half_to_float(a.p[1]); -} - -__device__ __half __soft_high2half(const __half2 a){ - return a.p[1]; -} - -__device__ __half2 __soft_high2half2(const __half2 a){ - return {a.p[1], a.p[1]}; -} - -__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){ - return {a.p[1], b.p[1]}; -} - -__device__ float __soft_low2float(const __half2 a){ - return cvt_half_to_float(a.p[0]); -} - -__device__ __half __soft_low2half(const __half2 a){ - return a.p[0]; -} - -__device__ __half2 __soft_low2half2(const __half2 a){ - return {a.p[0], a.p[0]}; -} - -__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - -__device__ __half2 __soft_lowhigh2highlow(const __half2 a){ - return {a.p[0], a.p[1]}; -} - -__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - - - -#endif diff --git a/src/hip_hc_gfx803.ll 
b/src/hip_hc_gfx803.ll index 0080fc7d81..7e3d0e37dd 100644 --- a/src/hip_hc_gfx803.ll +++ b/src/hip_hc_gfx803.ll @@ -2,89 +2,122 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: target triple = "amdgcn--amdhsa" -define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hadd2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 { - %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hfma2_int(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = bitcast <2 x half> %c to i32 + %4 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %1, i32 %2, i32 %3) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %1, i32 %2) + tail 
call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %4, i32 %3) + %5 = bitcast i32 %4 to <2 x half> + ret <2 x half> %5 } -define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hmul2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hsub2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_h2ceil_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2ceil_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2cos_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2cos_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2exp2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2exp2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2floor_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2floor_int(<2 x 
half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2log2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2log2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rcp_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rcp_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rsqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rsqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rsq_f16 $0, 
$1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sin_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sin_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2trunc_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } attributes #1 = { alwaysinline nounwind } From 46030bb2d2a30641023dd9a6d53266e32e9a5f9f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 21:59:14 -0500 Subject: [PATCH 091/171] Return precise address for hipHostGetDevicePointer. --- src/hip_memory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 5501fec734..fc2ada134e 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -202,7 +202,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); if (status == AM_SUCCESS) { - *devicePointer = amPointerInfo._devicePointer; + *devicePointer = static_cast<char*>(amPointerInfo._devicePointer) + (static_cast<char*>(hostPointer) - static_cast<char*>(amPointerInfo._hostPointer)) ; + tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, *devicePointer); } else { e = hipErrorMemoryAllocation; } From bdc08fcf10ebedff169ea3611c1b019f7070c829 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 18 May 2017 10:50:56 -0500 Subject: [PATCH 092/171] fixed vector type issues by reverting to old code, changed __half2 to map to vector types in llvm Change-Id: I7317408c25e8c1a0c02a346042c9137e160c8bbd --- include/hip/hcc_detail/hip_fp16.h | 5 +- include/hip/hcc_detail/hip_vector_types.h | 4038 ++++++++++++++++++++- 2 files changed, 3978 insertions(+), 65 deletions(-) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index f1f52e4122..a1abce2191 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -25,8 +25,9 @@ THE SOFTWARE.
#include "hip/hcc_detail/hip_vector_types.h" -typedef __half half; -typedef __half2 half2; +typedef __fp16 __half; +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); /* Half Arithmetic Functions diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 251da504ab..3c3b26c12a 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -34,93 +34,1120 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#if __cplusplus +#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x) { } \ +__device__ __host__ type(const type& val) : x(val.x) { } -typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); -typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); -typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); -typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } -typedef signed char char1 __attribute__((ext_vector_type(1))); -typedef signed char char2 __attribute__((ext_vector_type(2))); -typedef signed char char3 __attribute__((ext_vector_type(3))); -typedef signed char char4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } -typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); -typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); -typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); -typedef 
unsigned short ushort4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } -typedef signed short short1 __attribute__((ext_vector_type(1))); -typedef signed short short2 __attribute__((ext_vector_type(2))); -typedef signed short short3 __attribute__((ext_vector_type(3))); -typedef signed short short4 __attribute__((ext_vector_type(4))); -typedef __fp16 __half; +#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val) {} \ -typedef __fp16 __half1 __attribute__((ext_vector_type(1))); -typedef __fp16 __half2 __attribute__((ext_vector_type(2))); -typedef __fp16 __half3 __attribute__((ext_vector_type(3))); -typedef __fp16 __half4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val) {} \ +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} -typedef unsigned int uint1 __attribute__((ext_vector_type(1))); -typedef unsigned int uint2 __attribute__((ext_vector_type(2))); -typedef unsigned int uint3 __attribute__((ext_vector_type(3))); -typedef unsigned int uint4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} -typedef signed int int1 __attribute__((ext_vector_type(1))); -typedef signed int int2 __attribute__((ext_vector_type(2))); -typedef signed int int3 __attribute__((ext_vector_type(3))); -typedef signed int int4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ +__device__ __host__ type(type1 
val) : x(val), y(val), z(val), w(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} -typedef float float1 __attribute__((ext_vector_type(1))); -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); +struct uchar1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) -typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); -typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); -typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); -typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + #endif + unsigned char x; -typedef signed long long1 __attribute__((ext_vector_type(1))); -typedef signed long long2 __attribute__((ext_vector_type(2))); -typedef signed long long3 __attribute__((ext_vector_type(3))); -typedef signed long long4 __attribute__((ext_vector_type(4))); +} __attribute__((aligned(1))); -typedef double double1 __attribute__((ext_vector_type(1))); -typedef double double2 __attribute__((ext_vector_type(2))); -typedef double double3 
__attribute__((ext_vector_type(3))); -typedef double double4 __attribute__((ext_vector_type(4))); +struct uchar2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) -typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); -typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); -typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); -typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) + #endif + union { + struct { + unsigned char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); -typedef signed long long longlong1 __attribute__((ext_vector_type(1))); -typedef signed long long longlong2 __attribute__((ext_vector_type(2))); -typedef signed long long longlong3 __attribute__((ext_vector_type(3))); -typedef signed long long longlong4 __attribute__((ext_vector_type(4))); +struct uchar3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) + #endif + unsigned char x, y, z; +}; + +struct uchar4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) + #endif + union { + struct { + unsigned char x, y, z, w; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + +struct char1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed 
short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) + #endif + signed char x; +} __attribute__((aligned(1))); + +struct char2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) + #endif + union { + struct { + signed char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); + +struct char3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) + #endif + signed char x, y, z; +}; + +struct char4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) + #endif + union { + struct { + signed char x, y, z, w; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + + +struct ushort1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) + #endif + unsigned short x; +} __attribute__((aligned(2))); + +struct ushort2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) + #endif + union { + struct { + unsigned short x, y; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + +struct ushort3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) + #endif + unsigned short x, y, z; +}; + +struct ushort4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) + #endif + union { + struct { + unsigned short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + +struct short1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) + #endif + signed short x; +} __attribute__((aligned(2))); + +struct short2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) + #endif + union { + struct { + signed short x, y; + }; + unsigned int a; + }; + +} __attribute__((aligned(4))); + +struct short3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) 
+ MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) + #endif + signed short x, y, z; +}; + +struct short4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) + #endif + union { + struct { + signed short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + + +struct uint1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) + #endif + unsigned int x; +} __attribute__((aligned(4))); + +struct uint2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) + #endif + unsigned int x, y; +} __attribute__((aligned(8))); + +struct uint3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) + #endif + unsigned int x, y, z; +}; + +struct uint4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) + #endif + unsigned int x, y, z, w; +} __attribute__((aligned(16))); + +struct int1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) + #endif + signed int x; 
+} __attribute__((aligned(4))); + +struct int2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) + #endif + signed int x, y; +} __attribute__((aligned(8))); + +struct int3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) + #endif + signed int x, y, z; +}; + +struct int4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) + + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) + #endif + signed int x, y, z, w; +} __attribute__((aligned(16))); + + +struct float1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) + #endif + float x; +} __attribute__((aligned(4))); + +struct float2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) + #endif + float x, y; +} __attribute__((aligned(8))); + +struct float3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) + #endif + float x, y, z; +}; + +struct float4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed 
short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) + #endif + float x, y, z, w; +} __attribute__((aligned(16))); + + + +struct double1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) + #endif + double x; +} __attribute__((aligned(8))); + +struct double2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) + #endif + double x, y; +} __attribute__((aligned(16))); + +struct double3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) + #endif + double x, y, z; +}; + +struct double4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) + #endif + double x, y, z, w; +} __attribute__((aligned(32))); + + +struct ulong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) + #endif + unsigned long x; +} __attribute__((aligned(8))); + +struct ulong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, 
double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) + #endif + unsigned long x, y; +} __attribute__((aligned(16))); + +struct ulong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) + #endif + unsigned long x, y, z; +}; + +struct ulong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) + #endif + unsigned long x, y, z, w; +} __attribute__((aligned(32))); + + +struct long1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) + #endif + signed long x; +} __attribute__((aligned(8))); + +struct long2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) + #endif + signed long x, y; +} __attribute__((aligned(16))); + +struct long3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) + #endif + signed long x, y, z; +}; + +struct long4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long) + #endif + signed long x, y, z, w; +} __attribute__((aligned(32))); 
+
+
+struct ulonglong1 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long)
+  #endif
+  unsigned long long x;
+} __attribute__((aligned(8)));
+
+struct ulonglong2 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long)
+  #endif
+  unsigned long long x, y;
+} __attribute__((aligned(16)));
+
+struct ulonglong3 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long)
+  #endif
+  unsigned long long x, y, z;
+};
+
+struct ulonglong4 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long)
+  #endif
+  unsigned long long x, y, z, w;
+} __attribute__((aligned(32)));
+
+
+struct longlong1 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long)
+  #endif
+  signed long long x;
+} __attribute__((aligned(8)));
+
+struct longlong2 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long)
+  #endif
+  signed long long x, y;
+} __attribute__((aligned(16)));
+
+struct longlong3 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long)
+  #endif
+  signed long long x, y, z;
+};
+
+struct longlong4 {
+  #ifdef __cplusplus
+  public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long)
+  #endif
+  signed long long x, y, z, w;  /* same element type as longlong1..longlong3 (was 'signed long') */
+} __attribute__((aligned(32)));
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-__device__ __host__ static inline type make_##type(comp x) { \
-  type ret; \
+__device__ __host__ static inline struct type make_##type(comp x) { \
+  struct type ret; \
   ret.x = x; \
   return ret; \
 }
 
 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-__device__ __host__ static inline type make_##type(comp x, comp y) { \
-  type ret; \
+__device__ __host__ static inline struct type make_##type(comp x, comp y) { \
+  struct type ret; \
   ret.x = x; \
   ret.y = y; \
   return ret; \
 }
 
 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \
-  type ret; \
+__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \
+  struct type ret; \
   ret.x = x; \
   ret.y = y; \
   ret.z = z; \
@@ -128,8 +1155,8 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \
 }
 
 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \
-  type ret; \
+__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \
+  struct type ret; \
   ret.x = x; \
   ret.y = y; \
   ret.z = z; \
@@ -137,7 +1164,6 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp
   return ret; \
 }
 
-
 DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1);
 DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2);
 DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3);
@@ -199,6 +1225,2892 @@ DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3);
 DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4);
 
 
+#ifdef __cplusplus  /* operator overloads are C++ only; same guard form as used inside the structs */
+
+#define DECLOP_1VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+  lhs.x op rhs.x; \
+  return lhs; \
+}
+
+#define DECLOP_1VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+  op val.x; \
+  return val; \
+}
+
+#define DECLOP_1VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  val.x op; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_COMP(type, op) \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+  return lhs.x op rhs.x; \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+  return lhs.x op rhs.x; \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+  return lhs.x op rhs.x ; \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+  return lhs.x op rhs.x ; \
+}
+
+#define DECLOP_1VAR_1IN_1OUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_1IN_BOOLOUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+  return op rhs.x; \
+}
+
+/*
+  Two Element Access: the same operator families as above, applied to .x and .y
+*/
+
+#define DECLOP_2VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+  lhs.x op rhs.x; \
+  lhs.y op rhs.y; \
+  return lhs; \
+}
+
+#define DECLOP_2VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+  op val.x; \
+  op val.y; \
+  return val; \
+}
+
+#define DECLOP_2VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  ret.y = val.y; \
+  val.x op; \
+  val.y op; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_COMP(type, op) /* true only if EVERY component satisfies op; NOTE: so != needs all components to differ */ \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+}
+
+#define DECLOP_2VAR_1IN_1OUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  ret.y = op rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_1IN_BOOLOUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+  return (op rhs.x) && (op rhs.y); \
+}
+
+
+/*
+  Three Element Access: the same operator families, applied to .x, .y and .z
+*/
+
+#define DECLOP_3VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  ret.z = lhs.z op rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  ret.z = lhs.z * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  ret.z = lhs * rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+  lhs.x op rhs.x; \
+  lhs.y op rhs.y; \
+  lhs.z op rhs.z; \
+  return lhs; \
+}
+
+#define DECLOP_3VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+  op val.x; \
+  op val.y; \
+  op val.z; \
+  return val; \
+}
+
+#define DECLOP_3VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  ret.y = val.y; \
+  ret.z = val.z; \
+  val.x op; \
+  val.y op; \
+  val.z op; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_COMP(type, op) /* true only if EVERY component satisfies op; NOTE: so != needs all components to differ */ \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+}
+
+#define DECLOP_3VAR_1IN_1OUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  ret.y = op rhs.y; \
+  ret.z = op rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_1IN_BOOLOUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+  return (op rhs.x) && (op rhs.y) && (op rhs.z); \
+}
+
+
+/*
+  Four Element Access: the same operator families, applied to .x, .y, .z and .w
+*/
+
+#define DECLOP_4VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  ret.z = lhs.z op rhs.z; \
+  ret.w = lhs.w op rhs.w; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  ret.z = lhs.z * rhs; \
+  ret.w = lhs.w * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  ret.z = lhs * rhs.z; \
+  ret.w = lhs * rhs.w; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+  lhs.x op rhs.x; \
+  lhs.y op rhs.y; \
+  lhs.z op rhs.z; \
+  lhs.w op rhs.w; \
+  return lhs; \
+}
+
+#define DECLOP_4VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+  op val.x; \
+  op val.y; \
+  op val.z; \
+  op val.w; \
+  return val; \
+}
+
+#define DECLOP_4VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  ret.y = val.y; \
+  ret.z = val.z; \
+  ret.w = val.w; \
+  val.x op; \
+  val.y op; \
+  val.z op; \
+  val.w op; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_COMP(type, op) /* true only if EVERY component satisfies op; NOTE: so != needs all components to differ */ \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+}
+
+#define DECLOP_4VAR_1IN_1OUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  ret.y = op rhs.y; \
+  ret.z = op rhs.z; \
+  ret.w = op rhs.w; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_1IN_BOOLOUT(type, op) /* const ref: also accepts const objects and temporaries */ \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+  return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \
+}
+
+
+/*
+Overloading operators: instantiate the DECLOP_* macros above once per vector type
+*/
+
+// UNSIGNED CHAR1
+
+DECLOP_1VAR_2IN_1OUT(uchar1, +)
+DECLOP_1VAR_2IN_1OUT(uchar1, -)
+DECLOP_1VAR_2IN_1OUT(uchar1, *)
+DECLOP_1VAR_2IN_1OUT(uchar1, /)
+DECLOP_1VAR_2IN_1OUT(uchar1, %)
+DECLOP_1VAR_2IN_1OUT(uchar1, &)
+DECLOP_1VAR_2IN_1OUT(uchar1, |)
+DECLOP_1VAR_2IN_1OUT(uchar1, ^)
+DECLOP_1VAR_2IN_1OUT(uchar1, <<)
+DECLOP_1VAR_2IN_1OUT(uchar1, >>)
+
+
+DECLOP_1VAR_ASSIGN(uchar1, +=)
+DECLOP_1VAR_ASSIGN(uchar1, -=)
+DECLOP_1VAR_ASSIGN(uchar1, *=)
+DECLOP_1VAR_ASSIGN(uchar1, /=)
+DECLOP_1VAR_ASSIGN(uchar1, %=)
+DECLOP_1VAR_ASSIGN(uchar1, &=)
+DECLOP_1VAR_ASSIGN(uchar1, |=)
+DECLOP_1VAR_ASSIGN(uchar1, ^=)
+DECLOP_1VAR_ASSIGN(uchar1, <<=)
+DECLOP_1VAR_ASSIGN(uchar1, >>=)
+
+DECLOP_1VAR_PREOP(uchar1, ++)
+DECLOP_1VAR_PREOP(uchar1, --)
+
+DECLOP_1VAR_POSTOP(uchar1, ++)
+DECLOP_1VAR_POSTOP(uchar1, --)
+
+DECLOP_1VAR_COMP(uchar1, ==)
+DECLOP_1VAR_COMP(uchar1, !=)
+DECLOP_1VAR_COMP(uchar1, <)
+DECLOP_1VAR_COMP(uchar1, >)
+DECLOP_1VAR_COMP(uchar1, <=)
+DECLOP_1VAR_COMP(uchar1, >=)
+
+DECLOP_1VAR_COMP(uchar1, &&)
+DECLOP_1VAR_COMP(uchar1, ||)
+
+DECLOP_1VAR_1IN_1OUT(uchar1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(uchar1, !)
+// uchar1 scalar products: (uchar1 * scalar) and (scalar * uchar1) for each scalar type listed below
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, float)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, double)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long)
+
+// UNSIGNED CHAR2: component-wise operator overloads for uchar2 (.x, .y)
+
+DECLOP_2VAR_2IN_1OUT(uchar2, +)
+DECLOP_2VAR_2IN_1OUT(uchar2, -)
+DECLOP_2VAR_2IN_1OUT(uchar2, *)
+DECLOP_2VAR_2IN_1OUT(uchar2, /)
+DECLOP_2VAR_2IN_1OUT(uchar2, %)
+DECLOP_2VAR_2IN_1OUT(uchar2, &)
+DECLOP_2VAR_2IN_1OUT(uchar2, |)
+DECLOP_2VAR_2IN_1OUT(uchar2, ^)
+DECLOP_2VAR_2IN_1OUT(uchar2, <<)
+DECLOP_2VAR_2IN_1OUT(uchar2, >>)
+
+DECLOP_2VAR_ASSIGN(uchar2, +=)
+DECLOP_2VAR_ASSIGN(uchar2, -=)
+DECLOP_2VAR_ASSIGN(uchar2, *=)
+DECLOP_2VAR_ASSIGN(uchar2, /=)
+DECLOP_2VAR_ASSIGN(uchar2, %=)
+DECLOP_2VAR_ASSIGN(uchar2, &=)
+DECLOP_2VAR_ASSIGN(uchar2, |=)
+DECLOP_2VAR_ASSIGN(uchar2, ^=)
+DECLOP_2VAR_ASSIGN(uchar2, <<=)
+DECLOP_2VAR_ASSIGN(uchar2, >>=)
+
+DECLOP_2VAR_PREOP(uchar2, ++)
+DECLOP_2VAR_PREOP(uchar2, --)
+
+DECLOP_2VAR_POSTOP(uchar2, ++)
+DECLOP_2VAR_POSTOP(uchar2, --)
+
+DECLOP_2VAR_COMP(uchar2, ==)
+DECLOP_2VAR_COMP(uchar2, !=)  // NOTE(review): DECLOP_2VAR_COMP ANDs the per-component results, so this is true only if both x and y differ
+DECLOP_2VAR_COMP(uchar2, <)
+DECLOP_2VAR_COMP(uchar2, >)
+DECLOP_2VAR_COMP(uchar2, <=)
+DECLOP_2VAR_COMP(uchar2, >=)
+
+DECLOP_2VAR_COMP(uchar2, &&)
+DECLOP_2VAR_COMP(uchar2, ||)
+
+DECLOP_2VAR_1IN_1OUT(uchar2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(uchar2, !)
+// uchar2 scalar products: (uchar2 * scalar) and (scalar * uchar2) for each scalar type listed below
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, float)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, double)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long)
+
+// UNSIGNED CHAR3: component-wise operator overloads for uchar3 (.x, .y, .z)
+
+DECLOP_3VAR_2IN_1OUT(uchar3, +)
+DECLOP_3VAR_2IN_1OUT(uchar3, -)
+DECLOP_3VAR_2IN_1OUT(uchar3, *)
+DECLOP_3VAR_2IN_1OUT(uchar3, /)
+DECLOP_3VAR_2IN_1OUT(uchar3, %)
+DECLOP_3VAR_2IN_1OUT(uchar3, &)
+DECLOP_3VAR_2IN_1OUT(uchar3, |)
+DECLOP_3VAR_2IN_1OUT(uchar3, ^)
+DECLOP_3VAR_2IN_1OUT(uchar3, <<)
+DECLOP_3VAR_2IN_1OUT(uchar3, >>)
+
+DECLOP_3VAR_ASSIGN(uchar3, +=)
+DECLOP_3VAR_ASSIGN(uchar3, -=)
+DECLOP_3VAR_ASSIGN(uchar3, *=)
+DECLOP_3VAR_ASSIGN(uchar3, /=)
+DECLOP_3VAR_ASSIGN(uchar3, %=)
+DECLOP_3VAR_ASSIGN(uchar3, &=)
+DECLOP_3VAR_ASSIGN(uchar3, |=)
+DECLOP_3VAR_ASSIGN(uchar3, ^=)
+DECLOP_3VAR_ASSIGN(uchar3, <<=)
+DECLOP_3VAR_ASSIGN(uchar3, >>=)
+
+DECLOP_3VAR_PREOP(uchar3, ++)
+DECLOP_3VAR_PREOP(uchar3, --)
+
+DECLOP_3VAR_POSTOP(uchar3, ++)
+DECLOP_3VAR_POSTOP(uchar3, --)
+
+DECLOP_3VAR_COMP(uchar3, ==)
+DECLOP_3VAR_COMP(uchar3, !=)  // NOTE(review): DECLOP_3VAR_COMP ANDs the per-component results, so this is true only if x, y and z all differ
+DECLOP_3VAR_COMP(uchar3, <)
+DECLOP_3VAR_COMP(uchar3, >)
+DECLOP_3VAR_COMP(uchar3, <=)
+DECLOP_3VAR_COMP(uchar3, >=)
+
+DECLOP_3VAR_COMP(uchar3, &&)
+DECLOP_3VAR_COMP(uchar3, ||)
+
+DECLOP_3VAR_1IN_1OUT(uchar3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(uchar3, !)
+// uchar3 scalar products: (uchar3 * scalar) and (scalar * uchar3) for each scalar type listed below
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, float)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, double)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long)
+
+// UNSIGNED CHAR4: component-wise operator overloads for uchar4 (.x, .y, .z, .w)
+
+DECLOP_4VAR_2IN_1OUT(uchar4, +)
+DECLOP_4VAR_2IN_1OUT(uchar4, -)
+DECLOP_4VAR_2IN_1OUT(uchar4, *)
+DECLOP_4VAR_2IN_1OUT(uchar4, /)
+DECLOP_4VAR_2IN_1OUT(uchar4, %)
+DECLOP_4VAR_2IN_1OUT(uchar4, &)
+DECLOP_4VAR_2IN_1OUT(uchar4, |)
+DECLOP_4VAR_2IN_1OUT(uchar4, ^)
+DECLOP_4VAR_2IN_1OUT(uchar4, <<)
+DECLOP_4VAR_2IN_1OUT(uchar4, >>)
+
+DECLOP_4VAR_ASSIGN(uchar4, +=)
+DECLOP_4VAR_ASSIGN(uchar4, -=)
+DECLOP_4VAR_ASSIGN(uchar4, *=)
+DECLOP_4VAR_ASSIGN(uchar4, /=)
+DECLOP_4VAR_ASSIGN(uchar4, %=)
+DECLOP_4VAR_ASSIGN(uchar4, &=)
+DECLOP_4VAR_ASSIGN(uchar4, |=)
+DECLOP_4VAR_ASSIGN(uchar4, ^=)
+DECLOP_4VAR_ASSIGN(uchar4, <<=)
+DECLOP_4VAR_ASSIGN(uchar4, >>=)
+
+DECLOP_4VAR_PREOP(uchar4, ++)
+DECLOP_4VAR_PREOP(uchar4, --)
+
+DECLOP_4VAR_POSTOP(uchar4, ++)
+DECLOP_4VAR_POSTOP(uchar4, --)
+
+DECLOP_4VAR_COMP(uchar4, ==)
+DECLOP_4VAR_COMP(uchar4, !=)  // NOTE(review): DECLOP_4VAR_COMP ANDs the per-component results, so this is true only if x, y, z and w all differ
+DECLOP_4VAR_COMP(uchar4, <)
+DECLOP_4VAR_COMP(uchar4, >)
+DECLOP_4VAR_COMP(uchar4, <=)
+DECLOP_4VAR_COMP(uchar4, >=)
+
+DECLOP_4VAR_COMP(uchar4, &&)
+DECLOP_4VAR_COMP(uchar4, ||)
+
+DECLOP_4VAR_1IN_1OUT(uchar4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(uchar4, !)
+ +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) + +// SIGNED CHAR1 + +DECLOP_1VAR_2IN_1OUT(char1, +) +DECLOP_1VAR_2IN_1OUT(char1, -) +DECLOP_1VAR_2IN_1OUT(char1, *) +DECLOP_1VAR_2IN_1OUT(char1, /) +DECLOP_1VAR_2IN_1OUT(char1, %) +DECLOP_1VAR_2IN_1OUT(char1, &) +DECLOP_1VAR_2IN_1OUT(char1, |) +DECLOP_1VAR_2IN_1OUT(char1, ^) +DECLOP_1VAR_2IN_1OUT(char1, <<) +DECLOP_1VAR_2IN_1OUT(char1, >>) + + +DECLOP_1VAR_ASSIGN(char1, +=) +DECLOP_1VAR_ASSIGN(char1, -=) +DECLOP_1VAR_ASSIGN(char1, *=) +DECLOP_1VAR_ASSIGN(char1, /=) +DECLOP_1VAR_ASSIGN(char1, %=) +DECLOP_1VAR_ASSIGN(char1, &=) +DECLOP_1VAR_ASSIGN(char1, |=) +DECLOP_1VAR_ASSIGN(char1, ^=) +DECLOP_1VAR_ASSIGN(char1, <<=) +DECLOP_1VAR_ASSIGN(char1, >>=) + +DECLOP_1VAR_PREOP(char1, ++) +DECLOP_1VAR_PREOP(char1, --) + +DECLOP_1VAR_POSTOP(char1, ++) +DECLOP_1VAR_POSTOP(char1, --) + +DECLOP_1VAR_COMP(char1, ==) +DECLOP_1VAR_COMP(char1, !=) +DECLOP_1VAR_COMP(char1, <) +DECLOP_1VAR_COMP(char1, >) +DECLOP_1VAR_COMP(char1, <=) +DECLOP_1VAR_COMP(char1, >=) + +DECLOP_1VAR_COMP(char1, &&) +DECLOP_1VAR_COMP(char1, ||) + +DECLOP_1VAR_1IN_1OUT(char1, ~) +DECLOP_1VAR_1IN_BOOLOUT(char1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(char1, float) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(char1, double) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) + +// SIGNED CHAR2 + +DECLOP_2VAR_2IN_1OUT(char2, +) +DECLOP_2VAR_2IN_1OUT(char2, -) +DECLOP_2VAR_2IN_1OUT(char2, *) +DECLOP_2VAR_2IN_1OUT(char2, /) +DECLOP_2VAR_2IN_1OUT(char2, %) +DECLOP_2VAR_2IN_1OUT(char2, &) +DECLOP_2VAR_2IN_1OUT(char2, |) +DECLOP_2VAR_2IN_1OUT(char2, ^) +DECLOP_2VAR_2IN_1OUT(char2, <<) +DECLOP_2VAR_2IN_1OUT(char2, >>) + +DECLOP_2VAR_ASSIGN(char2, +=) +DECLOP_2VAR_ASSIGN(char2, -=) +DECLOP_2VAR_ASSIGN(char2, *=) +DECLOP_2VAR_ASSIGN(char2, /=) +DECLOP_2VAR_ASSIGN(char2, %=) +DECLOP_2VAR_ASSIGN(char2, &=) +DECLOP_2VAR_ASSIGN(char2, |=) +DECLOP_2VAR_ASSIGN(char2, ^=) +DECLOP_2VAR_ASSIGN(char2, <<=) +DECLOP_2VAR_ASSIGN(char2, >>=) + +DECLOP_2VAR_PREOP(char2, ++) +DECLOP_2VAR_PREOP(char2, --) + +DECLOP_2VAR_POSTOP(char2, ++) +DECLOP_2VAR_POSTOP(char2, --) + +DECLOP_2VAR_COMP(char2, ==) +DECLOP_2VAR_COMP(char2, !=) +DECLOP_2VAR_COMP(char2, <) +DECLOP_2VAR_COMP(char2, >) +DECLOP_2VAR_COMP(char2, <=) +DECLOP_2VAR_COMP(char2, >=) + +DECLOP_2VAR_COMP(char2, &&) +DECLOP_2VAR_COMP(char2, ||) + +DECLOP_2VAR_1IN_1OUT(char2, ~) +DECLOP_2VAR_1IN_BOOLOUT(char2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(char2, float) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(char2, double) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) + +// SIGNED CHAR3 + +DECLOP_3VAR_2IN_1OUT(char3, +) +DECLOP_3VAR_2IN_1OUT(char3, -) +DECLOP_3VAR_2IN_1OUT(char3, *) +DECLOP_3VAR_2IN_1OUT(char3, /) +DECLOP_3VAR_2IN_1OUT(char3, %) +DECLOP_3VAR_2IN_1OUT(char3, &) +DECLOP_3VAR_2IN_1OUT(char3, |) +DECLOP_3VAR_2IN_1OUT(char3, ^) +DECLOP_3VAR_2IN_1OUT(char3, <<) +DECLOP_3VAR_2IN_1OUT(char3, >>) + +DECLOP_3VAR_ASSIGN(char3, +=) +DECLOP_3VAR_ASSIGN(char3, -=) +DECLOP_3VAR_ASSIGN(char3, *=) +DECLOP_3VAR_ASSIGN(char3, /=) +DECLOP_3VAR_ASSIGN(char3, %=) +DECLOP_3VAR_ASSIGN(char3, &=) +DECLOP_3VAR_ASSIGN(char3, |=) +DECLOP_3VAR_ASSIGN(char3, ^=) +DECLOP_3VAR_ASSIGN(char3, <<=) +DECLOP_3VAR_ASSIGN(char3, >>=) + +DECLOP_3VAR_PREOP(char3, ++) +DECLOP_3VAR_PREOP(char3, --) + +DECLOP_3VAR_POSTOP(char3, ++) +DECLOP_3VAR_POSTOP(char3, --) + +DECLOP_3VAR_COMP(char3, ==) +DECLOP_3VAR_COMP(char3, !=) +DECLOP_3VAR_COMP(char3, <) +DECLOP_3VAR_COMP(char3, >) +DECLOP_3VAR_COMP(char3, <=) +DECLOP_3VAR_COMP(char3, >=) + +DECLOP_3VAR_COMP(char3, &&) +DECLOP_3VAR_COMP(char3, ||) + +DECLOP_3VAR_1IN_1OUT(char3, ~) +DECLOP_3VAR_1IN_BOOLOUT(char3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(char3, float) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(char3, double) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) + +// SIGNED CHAR4 + +DECLOP_4VAR_2IN_1OUT(char4, +) +DECLOP_4VAR_2IN_1OUT(char4, -) +DECLOP_4VAR_2IN_1OUT(char4, *) +DECLOP_4VAR_2IN_1OUT(char4, /) +DECLOP_4VAR_2IN_1OUT(char4, %) +DECLOP_4VAR_2IN_1OUT(char4, &) +DECLOP_4VAR_2IN_1OUT(char4, |) +DECLOP_4VAR_2IN_1OUT(char4, ^) +DECLOP_4VAR_2IN_1OUT(char4, <<) +DECLOP_4VAR_2IN_1OUT(char4, >>) + +DECLOP_4VAR_ASSIGN(char4, +=) +DECLOP_4VAR_ASSIGN(char4, -=) +DECLOP_4VAR_ASSIGN(char4, *=) +DECLOP_4VAR_ASSIGN(char4, /=) +DECLOP_4VAR_ASSIGN(char4, %=) +DECLOP_4VAR_ASSIGN(char4, &=) +DECLOP_4VAR_ASSIGN(char4, |=) +DECLOP_4VAR_ASSIGN(char4, ^=) +DECLOP_4VAR_ASSIGN(char4, <<=) +DECLOP_4VAR_ASSIGN(char4, >>=) + +DECLOP_4VAR_PREOP(char4, ++) +DECLOP_4VAR_PREOP(char4, --) + +DECLOP_4VAR_POSTOP(char4, ++) +DECLOP_4VAR_POSTOP(char4, --) + +DECLOP_4VAR_COMP(char4, ==) +DECLOP_4VAR_COMP(char4, !=) +DECLOP_4VAR_COMP(char4, <) +DECLOP_4VAR_COMP(char4, >) +DECLOP_4VAR_COMP(char4, <=) +DECLOP_4VAR_COMP(char4, >=) + +DECLOP_4VAR_COMP(char4, &&) +DECLOP_4VAR_COMP(char4, ||) + +DECLOP_4VAR_1IN_1OUT(char4, ~) +DECLOP_4VAR_1IN_BOOLOUT(char4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(char4, float) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(char4, double) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) + +// UNSIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(ushort1, +) +DECLOP_1VAR_2IN_1OUT(ushort1, -) +DECLOP_1VAR_2IN_1OUT(ushort1, *) +DECLOP_1VAR_2IN_1OUT(ushort1, /) +DECLOP_1VAR_2IN_1OUT(ushort1, %) +DECLOP_1VAR_2IN_1OUT(ushort1, &) +DECLOP_1VAR_2IN_1OUT(ushort1, |) +DECLOP_1VAR_2IN_1OUT(ushort1, ^) +DECLOP_1VAR_2IN_1OUT(ushort1, <<) +DECLOP_1VAR_2IN_1OUT(ushort1, >>) + + +DECLOP_1VAR_ASSIGN(ushort1, +=) +DECLOP_1VAR_ASSIGN(ushort1, -=) +DECLOP_1VAR_ASSIGN(ushort1, *=) +DECLOP_1VAR_ASSIGN(ushort1, /=) +DECLOP_1VAR_ASSIGN(ushort1, %=) +DECLOP_1VAR_ASSIGN(ushort1, &=) +DECLOP_1VAR_ASSIGN(ushort1, |=) +DECLOP_1VAR_ASSIGN(ushort1, ^=) +DECLOP_1VAR_ASSIGN(ushort1, <<=) +DECLOP_1VAR_ASSIGN(ushort1, >>=) + +DECLOP_1VAR_PREOP(ushort1, ++) +DECLOP_1VAR_PREOP(ushort1, --) + +DECLOP_1VAR_POSTOP(ushort1, ++) +DECLOP_1VAR_POSTOP(ushort1, --) + +DECLOP_1VAR_COMP(ushort1, ==) +DECLOP_1VAR_COMP(ushort1, !=) +DECLOP_1VAR_COMP(ushort1, <) +DECLOP_1VAR_COMP(ushort1, >) +DECLOP_1VAR_COMP(ushort1, <=) +DECLOP_1VAR_COMP(ushort1, >=) + +DECLOP_1VAR_COMP(ushort1, &&) +DECLOP_1VAR_COMP(ushort1, ||) + +DECLOP_1VAR_1IN_1OUT(ushort1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) + +// UNSIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(ushort2, +) +DECLOP_2VAR_2IN_1OUT(ushort2, -) +DECLOP_2VAR_2IN_1OUT(ushort2, *) +DECLOP_2VAR_2IN_1OUT(ushort2, /) +DECLOP_2VAR_2IN_1OUT(ushort2, %) +DECLOP_2VAR_2IN_1OUT(ushort2, &) +DECLOP_2VAR_2IN_1OUT(ushort2, |) +DECLOP_2VAR_2IN_1OUT(ushort2, ^) +DECLOP_2VAR_2IN_1OUT(ushort2, <<) +DECLOP_2VAR_2IN_1OUT(ushort2, >>) + +DECLOP_2VAR_ASSIGN(ushort2, +=) +DECLOP_2VAR_ASSIGN(ushort2, -=) +DECLOP_2VAR_ASSIGN(ushort2, *=) +DECLOP_2VAR_ASSIGN(ushort2, /=) +DECLOP_2VAR_ASSIGN(ushort2, %=) +DECLOP_2VAR_ASSIGN(ushort2, &=) +DECLOP_2VAR_ASSIGN(ushort2, |=) +DECLOP_2VAR_ASSIGN(ushort2, ^=) +DECLOP_2VAR_ASSIGN(ushort2, <<=) +DECLOP_2VAR_ASSIGN(ushort2, >>=) + +DECLOP_2VAR_PREOP(ushort2, ++) +DECLOP_2VAR_PREOP(ushort2, --) + +DECLOP_2VAR_POSTOP(ushort2, ++) +DECLOP_2VAR_POSTOP(ushort2, --) + +DECLOP_2VAR_COMP(ushort2, ==) +DECLOP_2VAR_COMP(ushort2, !=) +DECLOP_2VAR_COMP(ushort2, <) +DECLOP_2VAR_COMP(ushort2, >) +DECLOP_2VAR_COMP(ushort2, <=) +DECLOP_2VAR_COMP(ushort2, >=) + +DECLOP_2VAR_COMP(ushort2, &&) +DECLOP_2VAR_COMP(ushort2, ||) + +DECLOP_2VAR_1IN_1OUT(ushort2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) + +// UNSIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(ushort3, +) +DECLOP_3VAR_2IN_1OUT(ushort3, -) +DECLOP_3VAR_2IN_1OUT(ushort3, *) +DECLOP_3VAR_2IN_1OUT(ushort3, /) +DECLOP_3VAR_2IN_1OUT(ushort3, %) +DECLOP_3VAR_2IN_1OUT(ushort3, &) +DECLOP_3VAR_2IN_1OUT(ushort3, |) +DECLOP_3VAR_2IN_1OUT(ushort3, ^) +DECLOP_3VAR_2IN_1OUT(ushort3, <<) +DECLOP_3VAR_2IN_1OUT(ushort3, >>) + +DECLOP_3VAR_ASSIGN(ushort3, +=) +DECLOP_3VAR_ASSIGN(ushort3, -=) +DECLOP_3VAR_ASSIGN(ushort3, *=) +DECLOP_3VAR_ASSIGN(ushort3, /=) +DECLOP_3VAR_ASSIGN(ushort3, %=) +DECLOP_3VAR_ASSIGN(ushort3, &=) +DECLOP_3VAR_ASSIGN(ushort3, |=) +DECLOP_3VAR_ASSIGN(ushort3, ^=) +DECLOP_3VAR_ASSIGN(ushort3, <<=) +DECLOP_3VAR_ASSIGN(ushort3, >>=) + +DECLOP_3VAR_PREOP(ushort3, ++) +DECLOP_3VAR_PREOP(ushort3, --) + +DECLOP_3VAR_POSTOP(ushort3, ++) +DECLOP_3VAR_POSTOP(ushort3, --) + +DECLOP_3VAR_COMP(ushort3, ==) +DECLOP_3VAR_COMP(ushort3, !=) +DECLOP_3VAR_COMP(ushort3, <) +DECLOP_3VAR_COMP(ushort3, >) +DECLOP_3VAR_COMP(ushort3, <=) +DECLOP_3VAR_COMP(ushort3, >=) + +DECLOP_3VAR_COMP(ushort3, &&) +DECLOP_3VAR_COMP(ushort3, ||) + +DECLOP_3VAR_1IN_1OUT(ushort3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) + +// UNSIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(ushort4, +) +DECLOP_4VAR_2IN_1OUT(ushort4, -) +DECLOP_4VAR_2IN_1OUT(ushort4, *) +DECLOP_4VAR_2IN_1OUT(ushort4, /) +DECLOP_4VAR_2IN_1OUT(ushort4, %) +DECLOP_4VAR_2IN_1OUT(ushort4, &) +DECLOP_4VAR_2IN_1OUT(ushort4, |) +DECLOP_4VAR_2IN_1OUT(ushort4, ^) +DECLOP_4VAR_2IN_1OUT(ushort4, <<) +DECLOP_4VAR_2IN_1OUT(ushort4, >>) + +DECLOP_4VAR_ASSIGN(ushort4, +=) +DECLOP_4VAR_ASSIGN(ushort4, -=) +DECLOP_4VAR_ASSIGN(ushort4, *=) +DECLOP_4VAR_ASSIGN(ushort4, /=) +DECLOP_4VAR_ASSIGN(ushort4, %=) +DECLOP_4VAR_ASSIGN(ushort4, &=) +DECLOP_4VAR_ASSIGN(ushort4, |=) +DECLOP_4VAR_ASSIGN(ushort4, ^=) +DECLOP_4VAR_ASSIGN(ushort4, <<=) +DECLOP_4VAR_ASSIGN(ushort4, >>=) + +DECLOP_4VAR_PREOP(ushort4, ++) +DECLOP_4VAR_PREOP(ushort4, --) + +DECLOP_4VAR_POSTOP(ushort4, ++) +DECLOP_4VAR_POSTOP(ushort4, --) + +DECLOP_4VAR_COMP(ushort4, ==) +DECLOP_4VAR_COMP(ushort4, !=) +DECLOP_4VAR_COMP(ushort4, <) +DECLOP_4VAR_COMP(ushort4, >) +DECLOP_4VAR_COMP(ushort4, <=) +DECLOP_4VAR_COMP(ushort4, >=) + +DECLOP_4VAR_COMP(ushort4, &&) +DECLOP_4VAR_COMP(ushort4, ||) + +DECLOP_4VAR_1IN_1OUT(ushort4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) + +// SIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(short1, +) +DECLOP_1VAR_2IN_1OUT(short1, -) +DECLOP_1VAR_2IN_1OUT(short1, *) +DECLOP_1VAR_2IN_1OUT(short1, /) +DECLOP_1VAR_2IN_1OUT(short1, %) +DECLOP_1VAR_2IN_1OUT(short1, &) +DECLOP_1VAR_2IN_1OUT(short1, |) +DECLOP_1VAR_2IN_1OUT(short1, ^) +DECLOP_1VAR_2IN_1OUT(short1, <<) +DECLOP_1VAR_2IN_1OUT(short1, >>) + + +DECLOP_1VAR_ASSIGN(short1, +=) +DECLOP_1VAR_ASSIGN(short1, -=) +DECLOP_1VAR_ASSIGN(short1, *=) +DECLOP_1VAR_ASSIGN(short1, /=) +DECLOP_1VAR_ASSIGN(short1, %=) +DECLOP_1VAR_ASSIGN(short1, &=) +DECLOP_1VAR_ASSIGN(short1, |=) +DECLOP_1VAR_ASSIGN(short1, ^=) +DECLOP_1VAR_ASSIGN(short1, <<=) +DECLOP_1VAR_ASSIGN(short1, >>=) + +DECLOP_1VAR_PREOP(short1, ++) +DECLOP_1VAR_PREOP(short1, --) + +DECLOP_1VAR_POSTOP(short1, ++) +DECLOP_1VAR_POSTOP(short1, --) + +DECLOP_1VAR_COMP(short1, ==) +DECLOP_1VAR_COMP(short1, !=) +DECLOP_1VAR_COMP(short1, <) +DECLOP_1VAR_COMP(short1, >) +DECLOP_1VAR_COMP(short1, <=) +DECLOP_1VAR_COMP(short1, >=) + +DECLOP_1VAR_COMP(short1, &&) +DECLOP_1VAR_COMP(short1, ||) + +DECLOP_1VAR_1IN_1OUT(short1, ~) +DECLOP_1VAR_1IN_BOOLOUT(short1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(short1, float) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(short1, double) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) + +// SIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(short2, +) +DECLOP_2VAR_2IN_1OUT(short2, -) +DECLOP_2VAR_2IN_1OUT(short2, *) +DECLOP_2VAR_2IN_1OUT(short2, /) +DECLOP_2VAR_2IN_1OUT(short2, %) +DECLOP_2VAR_2IN_1OUT(short2, &) +DECLOP_2VAR_2IN_1OUT(short2, |) +DECLOP_2VAR_2IN_1OUT(short2, ^) +DECLOP_2VAR_2IN_1OUT(short2, <<) +DECLOP_2VAR_2IN_1OUT(short2, >>) + +DECLOP_2VAR_ASSIGN(short2, +=) +DECLOP_2VAR_ASSIGN(short2, -=) +DECLOP_2VAR_ASSIGN(short2, *=) +DECLOP_2VAR_ASSIGN(short2, /=) +DECLOP_2VAR_ASSIGN(short2, %=) +DECLOP_2VAR_ASSIGN(short2, &=) +DECLOP_2VAR_ASSIGN(short2, |=) +DECLOP_2VAR_ASSIGN(short2, ^=) +DECLOP_2VAR_ASSIGN(short2, <<=) +DECLOP_2VAR_ASSIGN(short2, >>=) + +DECLOP_2VAR_PREOP(short2, ++) +DECLOP_2VAR_PREOP(short2, --) + +DECLOP_2VAR_POSTOP(short2, ++) +DECLOP_2VAR_POSTOP(short2, --) + +DECLOP_2VAR_COMP(short2, ==) +DECLOP_2VAR_COMP(short2, !=) +DECLOP_2VAR_COMP(short2, <) +DECLOP_2VAR_COMP(short2, >) +DECLOP_2VAR_COMP(short2, <=) +DECLOP_2VAR_COMP(short2, >=) + +DECLOP_2VAR_COMP(short2, &&) +DECLOP_2VAR_COMP(short2, ||) + +DECLOP_2VAR_1IN_1OUT(short2, ~) +DECLOP_2VAR_1IN_BOOLOUT(short2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(short2, float) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(short2, double) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) + +// SIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(short3, +) +DECLOP_3VAR_2IN_1OUT(short3, -) +DECLOP_3VAR_2IN_1OUT(short3, *) +DECLOP_3VAR_2IN_1OUT(short3, /) +DECLOP_3VAR_2IN_1OUT(short3, %) +DECLOP_3VAR_2IN_1OUT(short3, &) +DECLOP_3VAR_2IN_1OUT(short3, |) +DECLOP_3VAR_2IN_1OUT(short3, ^) +DECLOP_3VAR_2IN_1OUT(short3, <<) +DECLOP_3VAR_2IN_1OUT(short3, >>) + +DECLOP_3VAR_ASSIGN(short3, +=) +DECLOP_3VAR_ASSIGN(short3, -=) +DECLOP_3VAR_ASSIGN(short3, *=) +DECLOP_3VAR_ASSIGN(short3, /=) +DECLOP_3VAR_ASSIGN(short3, %=) +DECLOP_3VAR_ASSIGN(short3, &=) +DECLOP_3VAR_ASSIGN(short3, |=) +DECLOP_3VAR_ASSIGN(short3, ^=) +DECLOP_3VAR_ASSIGN(short3, <<=) +DECLOP_3VAR_ASSIGN(short3, >>=) + +DECLOP_3VAR_PREOP(short3, ++) +DECLOP_3VAR_PREOP(short3, --) + +DECLOP_3VAR_POSTOP(short3, ++) +DECLOP_3VAR_POSTOP(short3, --) + +DECLOP_3VAR_COMP(short3, ==) +DECLOP_3VAR_COMP(short3, !=) +DECLOP_3VAR_COMP(short3, <) +DECLOP_3VAR_COMP(short3, >) +DECLOP_3VAR_COMP(short3, <=) +DECLOP_3VAR_COMP(short3, >=) + +DECLOP_3VAR_COMP(short3, &&) +DECLOP_3VAR_COMP(short3, ||) + +DECLOP_3VAR_1IN_1OUT(short3, ~) +DECLOP_3VAR_1IN_BOOLOUT(short3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(short3, float) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(short3, double) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) + +// SIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(short4, +) +DECLOP_4VAR_2IN_1OUT(short4, -) +DECLOP_4VAR_2IN_1OUT(short4, *) +DECLOP_4VAR_2IN_1OUT(short4, /) +DECLOP_4VAR_2IN_1OUT(short4, %) +DECLOP_4VAR_2IN_1OUT(short4, &) +DECLOP_4VAR_2IN_1OUT(short4, |) +DECLOP_4VAR_2IN_1OUT(short4, ^) +DECLOP_4VAR_2IN_1OUT(short4, <<) +DECLOP_4VAR_2IN_1OUT(short4, >>) + +DECLOP_4VAR_ASSIGN(short4, +=) +DECLOP_4VAR_ASSIGN(short4, -=) +DECLOP_4VAR_ASSIGN(short4, *=) +DECLOP_4VAR_ASSIGN(short4, /=) +DECLOP_4VAR_ASSIGN(short4, %=) +DECLOP_4VAR_ASSIGN(short4, &=) +DECLOP_4VAR_ASSIGN(short4, |=) +DECLOP_4VAR_ASSIGN(short4, ^=) +DECLOP_4VAR_ASSIGN(short4, <<=) +DECLOP_4VAR_ASSIGN(short4, >>=) + +DECLOP_4VAR_PREOP(short4, ++) +DECLOP_4VAR_PREOP(short4, --) + +DECLOP_4VAR_POSTOP(short4, ++) +DECLOP_4VAR_POSTOP(short4, --) + +DECLOP_4VAR_COMP(short4, ==) +DECLOP_4VAR_COMP(short4, !=) +DECLOP_4VAR_COMP(short4, <) +DECLOP_4VAR_COMP(short4, >) +DECLOP_4VAR_COMP(short4, <=) +DECLOP_4VAR_COMP(short4, >=) + +DECLOP_4VAR_COMP(short4, &&) +DECLOP_4VAR_COMP(short4, ||) + +DECLOP_4VAR_1IN_1OUT(short4, ~) +DECLOP_4VAR_1IN_BOOLOUT(short4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(short4, float) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(short4, double) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) + +// UNSIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(uint1, +) +DECLOP_1VAR_2IN_1OUT(uint1, -) +DECLOP_1VAR_2IN_1OUT(uint1, *) +DECLOP_1VAR_2IN_1OUT(uint1, /) +DECLOP_1VAR_2IN_1OUT(uint1, %) +DECLOP_1VAR_2IN_1OUT(uint1, &) +DECLOP_1VAR_2IN_1OUT(uint1, |) +DECLOP_1VAR_2IN_1OUT(uint1, ^) +DECLOP_1VAR_2IN_1OUT(uint1, <<) +DECLOP_1VAR_2IN_1OUT(uint1, >>) + + +DECLOP_1VAR_ASSIGN(uint1, +=) +DECLOP_1VAR_ASSIGN(uint1, -=) +DECLOP_1VAR_ASSIGN(uint1, *=) +DECLOP_1VAR_ASSIGN(uint1, /=) +DECLOP_1VAR_ASSIGN(uint1, %=) +DECLOP_1VAR_ASSIGN(uint1, &=) +DECLOP_1VAR_ASSIGN(uint1, |=) +DECLOP_1VAR_ASSIGN(uint1, ^=) +DECLOP_1VAR_ASSIGN(uint1, <<=) +DECLOP_1VAR_ASSIGN(uint1, >>=) + +DECLOP_1VAR_PREOP(uint1, ++) +DECLOP_1VAR_PREOP(uint1, --) + +DECLOP_1VAR_POSTOP(uint1, ++) +DECLOP_1VAR_POSTOP(uint1, --) + +DECLOP_1VAR_COMP(uint1, ==) +DECLOP_1VAR_COMP(uint1, !=) +DECLOP_1VAR_COMP(uint1, <) +DECLOP_1VAR_COMP(uint1, >) +DECLOP_1VAR_COMP(uint1, <=) +DECLOP_1VAR_COMP(uint1, >=) + +DECLOP_1VAR_COMP(uint1, &&) +DECLOP_1VAR_COMP(uint1, ||) + +DECLOP_1VAR_1IN_1OUT(uint1, ~) +DECLOP_1VAR_1IN_BOOLOUT(uint1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, float) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, double) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) + +// UNSIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(uint2, +) +DECLOP_2VAR_2IN_1OUT(uint2, -) +DECLOP_2VAR_2IN_1OUT(uint2, *) +DECLOP_2VAR_2IN_1OUT(uint2, /) +DECLOP_2VAR_2IN_1OUT(uint2, %) +DECLOP_2VAR_2IN_1OUT(uint2, &) +DECLOP_2VAR_2IN_1OUT(uint2, |) +DECLOP_2VAR_2IN_1OUT(uint2, ^) +DECLOP_2VAR_2IN_1OUT(uint2, <<) +DECLOP_2VAR_2IN_1OUT(uint2, >>) + +DECLOP_2VAR_ASSIGN(uint2, +=) +DECLOP_2VAR_ASSIGN(uint2, -=) +DECLOP_2VAR_ASSIGN(uint2, *=) +DECLOP_2VAR_ASSIGN(uint2, /=) +DECLOP_2VAR_ASSIGN(uint2, %=) +DECLOP_2VAR_ASSIGN(uint2, &=) +DECLOP_2VAR_ASSIGN(uint2, |=) +DECLOP_2VAR_ASSIGN(uint2, ^=) +DECLOP_2VAR_ASSIGN(uint2, <<=) +DECLOP_2VAR_ASSIGN(uint2, >>=) + +DECLOP_2VAR_PREOP(uint2, ++) +DECLOP_2VAR_PREOP(uint2, --) + +DECLOP_2VAR_POSTOP(uint2, ++) +DECLOP_2VAR_POSTOP(uint2, --) + +DECLOP_2VAR_COMP(uint2, ==) +DECLOP_2VAR_COMP(uint2, !=) +DECLOP_2VAR_COMP(uint2, <) +DECLOP_2VAR_COMP(uint2, >) +DECLOP_2VAR_COMP(uint2, <=) +DECLOP_2VAR_COMP(uint2, >=) + +DECLOP_2VAR_COMP(uint2, &&) +DECLOP_2VAR_COMP(uint2, ||) + +DECLOP_2VAR_1IN_1OUT(uint2, ~) +DECLOP_2VAR_1IN_BOOLOUT(uint2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, float) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, double) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) + +// UNSIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(uint3, +) +DECLOP_3VAR_2IN_1OUT(uint3, -) +DECLOP_3VAR_2IN_1OUT(uint3, *) +DECLOP_3VAR_2IN_1OUT(uint3, /) +DECLOP_3VAR_2IN_1OUT(uint3, %) +DECLOP_3VAR_2IN_1OUT(uint3, &) +DECLOP_3VAR_2IN_1OUT(uint3, |) +DECLOP_3VAR_2IN_1OUT(uint3, ^) +DECLOP_3VAR_2IN_1OUT(uint3, <<) +DECLOP_3VAR_2IN_1OUT(uint3, >>) + +DECLOP_3VAR_ASSIGN(uint3, +=) +DECLOP_3VAR_ASSIGN(uint3, -=) +DECLOP_3VAR_ASSIGN(uint3, *=) +DECLOP_3VAR_ASSIGN(uint3, /=) +DECLOP_3VAR_ASSIGN(uint3, %=) +DECLOP_3VAR_ASSIGN(uint3, &=) +DECLOP_3VAR_ASSIGN(uint3, |=) +DECLOP_3VAR_ASSIGN(uint3, ^=) +DECLOP_3VAR_ASSIGN(uint3, <<=) +DECLOP_3VAR_ASSIGN(uint3, >>=) + +DECLOP_3VAR_PREOP(uint3, ++) +DECLOP_3VAR_PREOP(uint3, --) + +DECLOP_3VAR_POSTOP(uint3, ++) +DECLOP_3VAR_POSTOP(uint3, --) + +DECLOP_3VAR_COMP(uint3, ==) +DECLOP_3VAR_COMP(uint3, !=) +DECLOP_3VAR_COMP(uint3, <) +DECLOP_3VAR_COMP(uint3, >) +DECLOP_3VAR_COMP(uint3, <=) +DECLOP_3VAR_COMP(uint3, >=) + +DECLOP_3VAR_COMP(uint3, &&) +DECLOP_3VAR_COMP(uint3, ||) + +DECLOP_3VAR_1IN_1OUT(uint3, ~) +DECLOP_3VAR_1IN_BOOLOUT(uint3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, float) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, double) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) + +// UNSIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(uint4, +) +DECLOP_4VAR_2IN_1OUT(uint4, -) +DECLOP_4VAR_2IN_1OUT(uint4, *) +DECLOP_4VAR_2IN_1OUT(uint4, /) +DECLOP_4VAR_2IN_1OUT(uint4, %) +DECLOP_4VAR_2IN_1OUT(uint4, &) +DECLOP_4VAR_2IN_1OUT(uint4, |) +DECLOP_4VAR_2IN_1OUT(uint4, ^) +DECLOP_4VAR_2IN_1OUT(uint4, <<) +DECLOP_4VAR_2IN_1OUT(uint4, >>) + +DECLOP_4VAR_ASSIGN(uint4, +=) +DECLOP_4VAR_ASSIGN(uint4, -=) +DECLOP_4VAR_ASSIGN(uint4, *=) +DECLOP_4VAR_ASSIGN(uint4, /=) +DECLOP_4VAR_ASSIGN(uint4, %=) +DECLOP_4VAR_ASSIGN(uint4, &=) +DECLOP_4VAR_ASSIGN(uint4, |=) +DECLOP_4VAR_ASSIGN(uint4, ^=) +DECLOP_4VAR_ASSIGN(uint4, <<=) +DECLOP_4VAR_ASSIGN(uint4, >>=) + +DECLOP_4VAR_PREOP(uint4, ++) +DECLOP_4VAR_PREOP(uint4, --) + +DECLOP_4VAR_POSTOP(uint4, ++) +DECLOP_4VAR_POSTOP(uint4, --) + +DECLOP_4VAR_COMP(uint4, ==) +DECLOP_4VAR_COMP(uint4, !=) +DECLOP_4VAR_COMP(uint4, <) +DECLOP_4VAR_COMP(uint4, >) +DECLOP_4VAR_COMP(uint4, <=) +DECLOP_4VAR_COMP(uint4, >=) + +DECLOP_4VAR_COMP(uint4, &&) +DECLOP_4VAR_COMP(uint4, ||) + +DECLOP_4VAR_1IN_1OUT(uint4, ~) +DECLOP_4VAR_1IN_BOOLOUT(uint4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, float) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, double) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) + +// SIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(int1, +) +DECLOP_1VAR_2IN_1OUT(int1, -) +DECLOP_1VAR_2IN_1OUT(int1, *) +DECLOP_1VAR_2IN_1OUT(int1, /) +DECLOP_1VAR_2IN_1OUT(int1, %) +DECLOP_1VAR_2IN_1OUT(int1, &) +DECLOP_1VAR_2IN_1OUT(int1, |) +DECLOP_1VAR_2IN_1OUT(int1, ^) +DECLOP_1VAR_2IN_1OUT(int1, <<) +DECLOP_1VAR_2IN_1OUT(int1, >>) + + +DECLOP_1VAR_ASSIGN(int1, +=) +DECLOP_1VAR_ASSIGN(int1, -=) +DECLOP_1VAR_ASSIGN(int1, *=) +DECLOP_1VAR_ASSIGN(int1, /=) +DECLOP_1VAR_ASSIGN(int1, %=) +DECLOP_1VAR_ASSIGN(int1, &=) +DECLOP_1VAR_ASSIGN(int1, |=) +DECLOP_1VAR_ASSIGN(int1, ^=) +DECLOP_1VAR_ASSIGN(int1, <<=) +DECLOP_1VAR_ASSIGN(int1, >>=) + +DECLOP_1VAR_PREOP(int1, ++) +DECLOP_1VAR_PREOP(int1, --) + +DECLOP_1VAR_POSTOP(int1, ++) +DECLOP_1VAR_POSTOP(int1, --) + +DECLOP_1VAR_COMP(int1, ==) +DECLOP_1VAR_COMP(int1, !=) +DECLOP_1VAR_COMP(int1, <) +DECLOP_1VAR_COMP(int1, >) +DECLOP_1VAR_COMP(int1, <=) +DECLOP_1VAR_COMP(int1, >=) + +DECLOP_1VAR_COMP(int1, &&) +DECLOP_1VAR_COMP(int1, ||) + +DECLOP_1VAR_1IN_1OUT(int1, ~) +DECLOP_1VAR_1IN_BOOLOUT(int1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(int1, float) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(int1, double) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) + +// SIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(int2, +) +DECLOP_2VAR_2IN_1OUT(int2, -) +DECLOP_2VAR_2IN_1OUT(int2, *) +DECLOP_2VAR_2IN_1OUT(int2, /) +DECLOP_2VAR_2IN_1OUT(int2, %) +DECLOP_2VAR_2IN_1OUT(int2, &) +DECLOP_2VAR_2IN_1OUT(int2, |) +DECLOP_2VAR_2IN_1OUT(int2, ^) +DECLOP_2VAR_2IN_1OUT(int2, <<) +DECLOP_2VAR_2IN_1OUT(int2, >>) + +DECLOP_2VAR_ASSIGN(int2, +=) +DECLOP_2VAR_ASSIGN(int2, -=) +DECLOP_2VAR_ASSIGN(int2, *=) +DECLOP_2VAR_ASSIGN(int2, /=) +DECLOP_2VAR_ASSIGN(int2, %=) +DECLOP_2VAR_ASSIGN(int2, &=) +DECLOP_2VAR_ASSIGN(int2, |=) +DECLOP_2VAR_ASSIGN(int2, ^=) +DECLOP_2VAR_ASSIGN(int2, <<=) +DECLOP_2VAR_ASSIGN(int2, >>=) + +DECLOP_2VAR_PREOP(int2, ++) +DECLOP_2VAR_PREOP(int2, --) + +DECLOP_2VAR_POSTOP(int2, ++) +DECLOP_2VAR_POSTOP(int2, --) + +DECLOP_2VAR_COMP(int2, ==) +DECLOP_2VAR_COMP(int2, !=) +DECLOP_2VAR_COMP(int2, <) +DECLOP_2VAR_COMP(int2, >) +DECLOP_2VAR_COMP(int2, <=) +DECLOP_2VAR_COMP(int2, >=) + +DECLOP_2VAR_COMP(int2, &&) +DECLOP_2VAR_COMP(int2, ||) + +DECLOP_2VAR_1IN_1OUT(int2, ~) +DECLOP_2VAR_1IN_BOOLOUT(int2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(int2, float) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(int2, double) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) + +// SIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(int3, +) +DECLOP_3VAR_2IN_1OUT(int3, -) +DECLOP_3VAR_2IN_1OUT(int3, *) +DECLOP_3VAR_2IN_1OUT(int3, /) +DECLOP_3VAR_2IN_1OUT(int3, %) +DECLOP_3VAR_2IN_1OUT(int3, &) +DECLOP_3VAR_2IN_1OUT(int3, |) +DECLOP_3VAR_2IN_1OUT(int3, ^) +DECLOP_3VAR_2IN_1OUT(int3, <<) +DECLOP_3VAR_2IN_1OUT(int3, >>) + +DECLOP_3VAR_ASSIGN(int3, +=) +DECLOP_3VAR_ASSIGN(int3, -=) +DECLOP_3VAR_ASSIGN(int3, *=) +DECLOP_3VAR_ASSIGN(int3, /=) +DECLOP_3VAR_ASSIGN(int3, %=) +DECLOP_3VAR_ASSIGN(int3, &=) +DECLOP_3VAR_ASSIGN(int3, |=) +DECLOP_3VAR_ASSIGN(int3, ^=) +DECLOP_3VAR_ASSIGN(int3, <<=) +DECLOP_3VAR_ASSIGN(int3, >>=) + +DECLOP_3VAR_PREOP(int3, ++) +DECLOP_3VAR_PREOP(int3, --) + +DECLOP_3VAR_POSTOP(int3, ++) +DECLOP_3VAR_POSTOP(int3, --) + +DECLOP_3VAR_COMP(int3, ==) +DECLOP_3VAR_COMP(int3, !=) +DECLOP_3VAR_COMP(int3, <) +DECLOP_3VAR_COMP(int3, >) +DECLOP_3VAR_COMP(int3, <=) +DECLOP_3VAR_COMP(int3, >=) + +DECLOP_3VAR_COMP(int3, &&) +DECLOP_3VAR_COMP(int3, ||) + +DECLOP_3VAR_1IN_1OUT(int3, ~) +DECLOP_3VAR_1IN_BOOLOUT(int3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(int3, float) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(int3, double) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) + +// SIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(int4, +) +DECLOP_4VAR_2IN_1OUT(int4, -) +DECLOP_4VAR_2IN_1OUT(int4, *) +DECLOP_4VAR_2IN_1OUT(int4, /) +DECLOP_4VAR_2IN_1OUT(int4, %) +DECLOP_4VAR_2IN_1OUT(int4, &) +DECLOP_4VAR_2IN_1OUT(int4, |) +DECLOP_4VAR_2IN_1OUT(int4, ^) +DECLOP_4VAR_2IN_1OUT(int4, <<) +DECLOP_4VAR_2IN_1OUT(int4, >>) + +DECLOP_4VAR_ASSIGN(int4, +=) +DECLOP_4VAR_ASSIGN(int4, -=) +DECLOP_4VAR_ASSIGN(int4, *=) +DECLOP_4VAR_ASSIGN(int4, /=) +DECLOP_4VAR_ASSIGN(int4, %=) +DECLOP_4VAR_ASSIGN(int4, &=) +DECLOP_4VAR_ASSIGN(int4, |=) +DECLOP_4VAR_ASSIGN(int4, ^=) +DECLOP_4VAR_ASSIGN(int4, <<=) +DECLOP_4VAR_ASSIGN(int4, >>=) + +DECLOP_4VAR_PREOP(int4, ++) +DECLOP_4VAR_PREOP(int4, --) + +DECLOP_4VAR_POSTOP(int4, ++) +DECLOP_4VAR_POSTOP(int4, --) + +DECLOP_4VAR_COMP(int4, ==) +DECLOP_4VAR_COMP(int4, !=) +DECLOP_4VAR_COMP(int4, <) +DECLOP_4VAR_COMP(int4, >) +DECLOP_4VAR_COMP(int4, <=) +DECLOP_4VAR_COMP(int4, >=) + +DECLOP_4VAR_COMP(int4, &&) +DECLOP_4VAR_COMP(int4, ||) + +DECLOP_4VAR_1IN_1OUT(int4, ~) +DECLOP_4VAR_1IN_BOOLOUT(int4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(int4, float) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(int4, double) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) + +// FLOAT1 + +DECLOP_1VAR_2IN_1OUT(float1, +) +DECLOP_1VAR_2IN_1OUT(float1, -) +DECLOP_1VAR_2IN_1OUT(float1, *) +DECLOP_1VAR_2IN_1OUT(float1, /) + +DECLOP_1VAR_ASSIGN(float1, +=) +DECLOP_1VAR_ASSIGN(float1, -=) +DECLOP_1VAR_ASSIGN(float1, *=) +DECLOP_1VAR_ASSIGN(float1, /=) + +DECLOP_1VAR_PREOP(float1, ++) +DECLOP_1VAR_PREOP(float1, --) + +DECLOP_1VAR_POSTOP(float1, ++) +DECLOP_1VAR_POSTOP(float1, --) + +DECLOP_1VAR_COMP(float1, ==) +DECLOP_1VAR_COMP(float1, !=) +DECLOP_1VAR_COMP(float1, <) +DECLOP_1VAR_COMP(float1, >) +DECLOP_1VAR_COMP(float1, <=) +DECLOP_1VAR_COMP(float1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(float1, float) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(float1, double) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) + +// FLOAT2 + +DECLOP_2VAR_2IN_1OUT(float2, +) +DECLOP_2VAR_2IN_1OUT(float2, -) +DECLOP_2VAR_2IN_1OUT(float2, *) +DECLOP_2VAR_2IN_1OUT(float2, /) + +DECLOP_2VAR_ASSIGN(float2, +=) +DECLOP_2VAR_ASSIGN(float2, -=) +DECLOP_2VAR_ASSIGN(float2, *=) 
+DECLOP_2VAR_ASSIGN(float2, /=) + +DECLOP_2VAR_PREOP(float2, ++) +DECLOP_2VAR_PREOP(float2, --) + +DECLOP_2VAR_POSTOP(float2, ++) +DECLOP_2VAR_POSTOP(float2, --) + +DECLOP_2VAR_COMP(float2, ==) +DECLOP_2VAR_COMP(float2, !=) +DECLOP_2VAR_COMP(float2, <) +DECLOP_2VAR_COMP(float2, >) +DECLOP_2VAR_COMP(float2, <=) +DECLOP_2VAR_COMP(float2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(float2, float) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(float2, double) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) + +// FLOAT3 + +DECLOP_3VAR_2IN_1OUT(float3, +) +DECLOP_3VAR_2IN_1OUT(float3, -) +DECLOP_3VAR_2IN_1OUT(float3, *) +DECLOP_3VAR_2IN_1OUT(float3, /) + +DECLOP_3VAR_ASSIGN(float3, +=) +DECLOP_3VAR_ASSIGN(float3, -=) +DECLOP_3VAR_ASSIGN(float3, *=) +DECLOP_3VAR_ASSIGN(float3, /=) + +DECLOP_3VAR_PREOP(float3, ++) +DECLOP_3VAR_PREOP(float3, --) + +DECLOP_3VAR_POSTOP(float3, ++) +DECLOP_3VAR_POSTOP(float3, --) + +DECLOP_3VAR_COMP(float3, ==) +DECLOP_3VAR_COMP(float3, !=) +DECLOP_3VAR_COMP(float3, <) +DECLOP_3VAR_COMP(float3, >) +DECLOP_3VAR_COMP(float3, <=) +DECLOP_3VAR_COMP(float3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(float3, float) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(float3, double) 
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) + +// FLOAT4 + +DECLOP_4VAR_2IN_1OUT(float4, +) +DECLOP_4VAR_2IN_1OUT(float4, -) +DECLOP_4VAR_2IN_1OUT(float4, *) +DECLOP_4VAR_2IN_1OUT(float4, /) + +DECLOP_4VAR_ASSIGN(float4, +=) +DECLOP_4VAR_ASSIGN(float4, -=) +DECLOP_4VAR_ASSIGN(float4, *=) +DECLOP_4VAR_ASSIGN(float4, /=) + +DECLOP_4VAR_PREOP(float4, ++) +DECLOP_4VAR_PREOP(float4, --) + +DECLOP_4VAR_POSTOP(float4, ++) +DECLOP_4VAR_POSTOP(float4, --) + +DECLOP_4VAR_COMP(float4, ==) +DECLOP_4VAR_COMP(float4, !=) +DECLOP_4VAR_COMP(float4, <) +DECLOP_4VAR_COMP(float4, >) +DECLOP_4VAR_COMP(float4, <=) +DECLOP_4VAR_COMP(float4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(float4, float) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(float4, double) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) + +// DOUBLE1 + +DECLOP_1VAR_2IN_1OUT(double1, +) +DECLOP_1VAR_2IN_1OUT(double1, -) +DECLOP_1VAR_2IN_1OUT(double1, *) +DECLOP_1VAR_2IN_1OUT(double1, /) + +DECLOP_1VAR_ASSIGN(double1, +=) +DECLOP_1VAR_ASSIGN(double1, -=) +DECLOP_1VAR_ASSIGN(double1, *=) +DECLOP_1VAR_ASSIGN(double1, /=) + +DECLOP_1VAR_PREOP(double1, ++) +DECLOP_1VAR_PREOP(double1, --) + +DECLOP_1VAR_POSTOP(double1, ++) +DECLOP_1VAR_POSTOP(double1, --) + +DECLOP_1VAR_COMP(double1, ==) +DECLOP_1VAR_COMP(double1, !=) +DECLOP_1VAR_COMP(double1, <) +DECLOP_1VAR_COMP(double1, >) +DECLOP_1VAR_COMP(double1, <=) +DECLOP_1VAR_COMP(double1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) 
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(double1, float) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(double1, double) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) + +// DOUBLE2 + +DECLOP_2VAR_2IN_1OUT(double2, +) +DECLOP_2VAR_2IN_1OUT(double2, -) +DECLOP_2VAR_2IN_1OUT(double2, *) +DECLOP_2VAR_2IN_1OUT(double2, /) + +DECLOP_2VAR_ASSIGN(double2, +=) +DECLOP_2VAR_ASSIGN(double2, -=) +DECLOP_2VAR_ASSIGN(double2, *=) +DECLOP_2VAR_ASSIGN(double2, /=) + +DECLOP_2VAR_PREOP(double2, ++) +DECLOP_2VAR_PREOP(double2, --) + +DECLOP_2VAR_POSTOP(double2, ++) +DECLOP_2VAR_POSTOP(double2, --) + +DECLOP_2VAR_COMP(double2, ==) +DECLOP_2VAR_COMP(double2, !=) +DECLOP_2VAR_COMP(double2, <) +DECLOP_2VAR_COMP(double2, >) +DECLOP_2VAR_COMP(double2, <=) +DECLOP_2VAR_COMP(double2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(double2, float) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(double2, double) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) + +// DOUBLE3 + +DECLOP_3VAR_2IN_1OUT(double3, +) +DECLOP_3VAR_2IN_1OUT(double3, -) +DECLOP_3VAR_2IN_1OUT(double3, *) +DECLOP_3VAR_2IN_1OUT(double3, /) + +DECLOP_3VAR_ASSIGN(double3, +=) +DECLOP_3VAR_ASSIGN(double3, -=) +DECLOP_3VAR_ASSIGN(double3, *=) +DECLOP_3VAR_ASSIGN(double3, /=) + 
+DECLOP_3VAR_PREOP(double3, ++) +DECLOP_3VAR_PREOP(double3, --) + +DECLOP_3VAR_POSTOP(double3, ++) +DECLOP_3VAR_POSTOP(double3, --) + +DECLOP_3VAR_COMP(double3, ==) +DECLOP_3VAR_COMP(double3, !=) +DECLOP_3VAR_COMP(double3, <) +DECLOP_3VAR_COMP(double3, >) +DECLOP_3VAR_COMP(double3, <=) +DECLOP_3VAR_COMP(double3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(double3, float) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(double3, double) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) + +// DOUBLE4 + +DECLOP_4VAR_2IN_1OUT(double4, +) +DECLOP_4VAR_2IN_1OUT(double4, -) +DECLOP_4VAR_2IN_1OUT(double4, *) +DECLOP_4VAR_2IN_1OUT(double4, /) + +DECLOP_4VAR_ASSIGN(double4, +=) +DECLOP_4VAR_ASSIGN(double4, -=) +DECLOP_4VAR_ASSIGN(double4, *=) +DECLOP_4VAR_ASSIGN(double4, /=) + +DECLOP_4VAR_PREOP(double4, ++) +DECLOP_4VAR_PREOP(double4, --) + +DECLOP_4VAR_POSTOP(double4, ++) +DECLOP_4VAR_POSTOP(double4, --) + +DECLOP_4VAR_COMP(double4, ==) +DECLOP_4VAR_COMP(double4, !=) +DECLOP_4VAR_COMP(double4, <) +DECLOP_4VAR_COMP(double4, >) +DECLOP_4VAR_COMP(double4, <=) +DECLOP_4VAR_COMP(double4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(double4, float) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) 
+DECLOP_4VAR_SCALE_PRODUCT(double4, double) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) + +// UNSIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(ulong1, +) +DECLOP_1VAR_2IN_1OUT(ulong1, -) +DECLOP_1VAR_2IN_1OUT(ulong1, *) +DECLOP_1VAR_2IN_1OUT(ulong1, /) +DECLOP_1VAR_2IN_1OUT(ulong1, %) +DECLOP_1VAR_2IN_1OUT(ulong1, &) +DECLOP_1VAR_2IN_1OUT(ulong1, |) +DECLOP_1VAR_2IN_1OUT(ulong1, ^) +DECLOP_1VAR_2IN_1OUT(ulong1, <<) +DECLOP_1VAR_2IN_1OUT(ulong1, >>) + + +DECLOP_1VAR_ASSIGN(ulong1, +=) +DECLOP_1VAR_ASSIGN(ulong1, -=) +DECLOP_1VAR_ASSIGN(ulong1, *=) +DECLOP_1VAR_ASSIGN(ulong1, /=) +DECLOP_1VAR_ASSIGN(ulong1, %=) +DECLOP_1VAR_ASSIGN(ulong1, &=) +DECLOP_1VAR_ASSIGN(ulong1, |=) +DECLOP_1VAR_ASSIGN(ulong1, ^=) +DECLOP_1VAR_ASSIGN(ulong1, <<=) +DECLOP_1VAR_ASSIGN(ulong1, >>=) + +DECLOP_1VAR_PREOP(ulong1, ++) +DECLOP_1VAR_PREOP(ulong1, --) + +DECLOP_1VAR_POSTOP(ulong1, ++) +DECLOP_1VAR_POSTOP(ulong1, --) + +DECLOP_1VAR_COMP(ulong1, ==) +DECLOP_1VAR_COMP(ulong1, !=) +DECLOP_1VAR_COMP(ulong1, <) +DECLOP_1VAR_COMP(ulong1, >) +DECLOP_1VAR_COMP(ulong1, <=) +DECLOP_1VAR_COMP(ulong1, >=) + +DECLOP_1VAR_COMP(ulong1, &&) +DECLOP_1VAR_COMP(ulong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) + +// UNSIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(ulong2, +) +DECLOP_2VAR_2IN_1OUT(ulong2, -) +DECLOP_2VAR_2IN_1OUT(ulong2, *) +DECLOP_2VAR_2IN_1OUT(ulong2, /) +DECLOP_2VAR_2IN_1OUT(ulong2, %) +DECLOP_2VAR_2IN_1OUT(ulong2, &) +DECLOP_2VAR_2IN_1OUT(ulong2, |) +DECLOP_2VAR_2IN_1OUT(ulong2, ^) +DECLOP_2VAR_2IN_1OUT(ulong2, <<) +DECLOP_2VAR_2IN_1OUT(ulong2, >>) + +DECLOP_2VAR_ASSIGN(ulong2, +=) +DECLOP_2VAR_ASSIGN(ulong2, -=) +DECLOP_2VAR_ASSIGN(ulong2, *=) +DECLOP_2VAR_ASSIGN(ulong2, /=) +DECLOP_2VAR_ASSIGN(ulong2, %=) +DECLOP_2VAR_ASSIGN(ulong2, &=) +DECLOP_2VAR_ASSIGN(ulong2, |=) +DECLOP_2VAR_ASSIGN(ulong2, ^=) +DECLOP_2VAR_ASSIGN(ulong2, <<=) +DECLOP_2VAR_ASSIGN(ulong2, >>=) + +DECLOP_2VAR_PREOP(ulong2, ++) +DECLOP_2VAR_PREOP(ulong2, --) + +DECLOP_2VAR_POSTOP(ulong2, ++) +DECLOP_2VAR_POSTOP(ulong2, --) + +DECLOP_2VAR_COMP(ulong2, ==) +DECLOP_2VAR_COMP(ulong2, !=) +DECLOP_2VAR_COMP(ulong2, <) +DECLOP_2VAR_COMP(ulong2, >) +DECLOP_2VAR_COMP(ulong2, <=) +DECLOP_2VAR_COMP(ulong2, >=) + +DECLOP_2VAR_COMP(ulong2, &&) +DECLOP_2VAR_COMP(ulong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) + +// UNSIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(ulong3, +) +DECLOP_3VAR_2IN_1OUT(ulong3, -) +DECLOP_3VAR_2IN_1OUT(ulong3, *) +DECLOP_3VAR_2IN_1OUT(ulong3, /) +DECLOP_3VAR_2IN_1OUT(ulong3, %) +DECLOP_3VAR_2IN_1OUT(ulong3, &) +DECLOP_3VAR_2IN_1OUT(ulong3, |) +DECLOP_3VAR_2IN_1OUT(ulong3, ^) +DECLOP_3VAR_2IN_1OUT(ulong3, <<) +DECLOP_3VAR_2IN_1OUT(ulong3, >>) + +DECLOP_3VAR_ASSIGN(ulong3, +=) +DECLOP_3VAR_ASSIGN(ulong3, -=) +DECLOP_3VAR_ASSIGN(ulong3, *=) +DECLOP_3VAR_ASSIGN(ulong3, /=) +DECLOP_3VAR_ASSIGN(ulong3, %=) +DECLOP_3VAR_ASSIGN(ulong3, &=) +DECLOP_3VAR_ASSIGN(ulong3, |=) +DECLOP_3VAR_ASSIGN(ulong3, ^=) +DECLOP_3VAR_ASSIGN(ulong3, <<=) +DECLOP_3VAR_ASSIGN(ulong3, >>=) + +DECLOP_3VAR_PREOP(ulong3, ++) +DECLOP_3VAR_PREOP(ulong3, --) + +DECLOP_3VAR_POSTOP(ulong3, ++) +DECLOP_3VAR_POSTOP(ulong3, --) + +DECLOP_3VAR_COMP(ulong3, ==) +DECLOP_3VAR_COMP(ulong3, !=) +DECLOP_3VAR_COMP(ulong3, <) +DECLOP_3VAR_COMP(ulong3, >) +DECLOP_3VAR_COMP(ulong3, <=) +DECLOP_3VAR_COMP(ulong3, >=) + +DECLOP_3VAR_COMP(ulong3, &&) +DECLOP_3VAR_COMP(ulong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) + +// UNSIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(ulong4, +) +DECLOP_4VAR_2IN_1OUT(ulong4, -) +DECLOP_4VAR_2IN_1OUT(ulong4, *) +DECLOP_4VAR_2IN_1OUT(ulong4, /) +DECLOP_4VAR_2IN_1OUT(ulong4, %) +DECLOP_4VAR_2IN_1OUT(ulong4, &) +DECLOP_4VAR_2IN_1OUT(ulong4, |) +DECLOP_4VAR_2IN_1OUT(ulong4, ^) +DECLOP_4VAR_2IN_1OUT(ulong4, <<) +DECLOP_4VAR_2IN_1OUT(ulong4, >>) + +DECLOP_4VAR_ASSIGN(ulong4, +=) +DECLOP_4VAR_ASSIGN(ulong4, -=) +DECLOP_4VAR_ASSIGN(ulong4, *=) +DECLOP_4VAR_ASSIGN(ulong4, /=) +DECLOP_4VAR_ASSIGN(ulong4, %=) +DECLOP_4VAR_ASSIGN(ulong4, &=) +DECLOP_4VAR_ASSIGN(ulong4, |=) +DECLOP_4VAR_ASSIGN(ulong4, ^=) +DECLOP_4VAR_ASSIGN(ulong4, <<=) +DECLOP_4VAR_ASSIGN(ulong4, >>=) + +DECLOP_4VAR_PREOP(ulong4, ++) +DECLOP_4VAR_PREOP(ulong4, --) + +DECLOP_4VAR_POSTOP(ulong4, ++) +DECLOP_4VAR_POSTOP(ulong4, --) + +DECLOP_4VAR_COMP(ulong4, ==) +DECLOP_4VAR_COMP(ulong4, !=) +DECLOP_4VAR_COMP(ulong4, <) +DECLOP_4VAR_COMP(ulong4, >) +DECLOP_4VAR_COMP(ulong4, <=) +DECLOP_4VAR_COMP(ulong4, >=) + +DECLOP_4VAR_COMP(ulong4, &&) +DECLOP_4VAR_COMP(ulong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) + +// SIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(long1, +) +DECLOP_1VAR_2IN_1OUT(long1, -) +DECLOP_1VAR_2IN_1OUT(long1, *) +DECLOP_1VAR_2IN_1OUT(long1, /) +DECLOP_1VAR_2IN_1OUT(long1, %) +DECLOP_1VAR_2IN_1OUT(long1, &) +DECLOP_1VAR_2IN_1OUT(long1, |) +DECLOP_1VAR_2IN_1OUT(long1, ^) +DECLOP_1VAR_2IN_1OUT(long1, <<) +DECLOP_1VAR_2IN_1OUT(long1, >>) + + +DECLOP_1VAR_ASSIGN(long1, +=) +DECLOP_1VAR_ASSIGN(long1, -=) +DECLOP_1VAR_ASSIGN(long1, *=) +DECLOP_1VAR_ASSIGN(long1, /=) +DECLOP_1VAR_ASSIGN(long1, %=) +DECLOP_1VAR_ASSIGN(long1, &=) +DECLOP_1VAR_ASSIGN(long1, |=) +DECLOP_1VAR_ASSIGN(long1, ^=) +DECLOP_1VAR_ASSIGN(long1, <<=) +DECLOP_1VAR_ASSIGN(long1, >>=) + +DECLOP_1VAR_PREOP(long1, ++) +DECLOP_1VAR_PREOP(long1, --) + +DECLOP_1VAR_POSTOP(long1, ++) +DECLOP_1VAR_POSTOP(long1, --) + +DECLOP_1VAR_COMP(long1, ==) +DECLOP_1VAR_COMP(long1, !=) +DECLOP_1VAR_COMP(long1, <) +DECLOP_1VAR_COMP(long1, >) +DECLOP_1VAR_COMP(long1, <=) +DECLOP_1VAR_COMP(long1, >=) + +DECLOP_1VAR_COMP(long1, &&) +DECLOP_1VAR_COMP(long1, ||) + +DECLOP_1VAR_1IN_1OUT(long1, ~) +DECLOP_1VAR_1IN_BOOLOUT(long1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(long1, float) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(long1, double) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) + +// SIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(long2, +) +DECLOP_2VAR_2IN_1OUT(long2, -) +DECLOP_2VAR_2IN_1OUT(long2, *) +DECLOP_2VAR_2IN_1OUT(long2, /) +DECLOP_2VAR_2IN_1OUT(long2, %) +DECLOP_2VAR_2IN_1OUT(long2, &) +DECLOP_2VAR_2IN_1OUT(long2, |) +DECLOP_2VAR_2IN_1OUT(long2, ^) +DECLOP_2VAR_2IN_1OUT(long2, <<) +DECLOP_2VAR_2IN_1OUT(long2, >>) + +DECLOP_2VAR_ASSIGN(long2, +=) +DECLOP_2VAR_ASSIGN(long2, -=) +DECLOP_2VAR_ASSIGN(long2, *=) +DECLOP_2VAR_ASSIGN(long2, /=) +DECLOP_2VAR_ASSIGN(long2, %=) +DECLOP_2VAR_ASSIGN(long2, &=) +DECLOP_2VAR_ASSIGN(long2, |=) +DECLOP_2VAR_ASSIGN(long2, ^=) +DECLOP_2VAR_ASSIGN(long2, <<=) +DECLOP_2VAR_ASSIGN(long2, >>=) + +DECLOP_2VAR_PREOP(long2, ++) +DECLOP_2VAR_PREOP(long2, --) + +DECLOP_2VAR_POSTOP(long2, ++) +DECLOP_2VAR_POSTOP(long2, --) + +DECLOP_2VAR_COMP(long2, ==) +DECLOP_2VAR_COMP(long2, !=) +DECLOP_2VAR_COMP(long2, <) +DECLOP_2VAR_COMP(long2, >) +DECLOP_2VAR_COMP(long2, <=) +DECLOP_2VAR_COMP(long2, >=) + +DECLOP_2VAR_COMP(long2, &&) +DECLOP_2VAR_COMP(long2, ||) + +DECLOP_2VAR_1IN_1OUT(long2, ~) +DECLOP_2VAR_1IN_BOOLOUT(long2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(long2, float) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(long2, double) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) + +// SIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(long3, +) +DECLOP_3VAR_2IN_1OUT(long3, -) +DECLOP_3VAR_2IN_1OUT(long3, *) +DECLOP_3VAR_2IN_1OUT(long3, /) +DECLOP_3VAR_2IN_1OUT(long3, %) +DECLOP_3VAR_2IN_1OUT(long3, &) +DECLOP_3VAR_2IN_1OUT(long3, |) +DECLOP_3VAR_2IN_1OUT(long3, ^) +DECLOP_3VAR_2IN_1OUT(long3, <<) +DECLOP_3VAR_2IN_1OUT(long3, >>) + +DECLOP_3VAR_ASSIGN(long3, +=) +DECLOP_3VAR_ASSIGN(long3, -=) +DECLOP_3VAR_ASSIGN(long3, *=) +DECLOP_3VAR_ASSIGN(long3, /=) +DECLOP_3VAR_ASSIGN(long3, %=) +DECLOP_3VAR_ASSIGN(long3, &=) +DECLOP_3VAR_ASSIGN(long3, |=) +DECLOP_3VAR_ASSIGN(long3, ^=) +DECLOP_3VAR_ASSIGN(long3, <<=) +DECLOP_3VAR_ASSIGN(long3, >>=) + +DECLOP_3VAR_PREOP(long3, ++) +DECLOP_3VAR_PREOP(long3, --) + +DECLOP_3VAR_POSTOP(long3, ++) +DECLOP_3VAR_POSTOP(long3, --) + +DECLOP_3VAR_COMP(long3, ==) +DECLOP_3VAR_COMP(long3, !=) +DECLOP_3VAR_COMP(long3, <) +DECLOP_3VAR_COMP(long3, >) +DECLOP_3VAR_COMP(long3, <=) +DECLOP_3VAR_COMP(long3, >=) + +DECLOP_3VAR_COMP(long3, &&) +DECLOP_3VAR_COMP(long3, ||) + +DECLOP_3VAR_1IN_1OUT(long3, ~) +DECLOP_3VAR_1IN_BOOLOUT(long3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(long3, float) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(long3, double) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) + +// SIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(long4, +) +DECLOP_4VAR_2IN_1OUT(long4, -) +DECLOP_4VAR_2IN_1OUT(long4, *) +DECLOP_4VAR_2IN_1OUT(long4, /) +DECLOP_4VAR_2IN_1OUT(long4, %) +DECLOP_4VAR_2IN_1OUT(long4, &) +DECLOP_4VAR_2IN_1OUT(long4, |) +DECLOP_4VAR_2IN_1OUT(long4, ^) +DECLOP_4VAR_2IN_1OUT(long4, <<) +DECLOP_4VAR_2IN_1OUT(long4, >>) + +DECLOP_4VAR_ASSIGN(long4, +=) +DECLOP_4VAR_ASSIGN(long4, -=) +DECLOP_4VAR_ASSIGN(long4, *=) +DECLOP_4VAR_ASSIGN(long4, /=) +DECLOP_4VAR_ASSIGN(long4, %=) +DECLOP_4VAR_ASSIGN(long4, &=) +DECLOP_4VAR_ASSIGN(long4, |=) +DECLOP_4VAR_ASSIGN(long4, ^=) +DECLOP_4VAR_ASSIGN(long4, <<=) +DECLOP_4VAR_ASSIGN(long4, >>=) + +DECLOP_4VAR_PREOP(long4, ++) +DECLOP_4VAR_PREOP(long4, --) + +DECLOP_4VAR_POSTOP(long4, ++) +DECLOP_4VAR_POSTOP(long4, --) + +DECLOP_4VAR_COMP(long4, ==) +DECLOP_4VAR_COMP(long4, !=) +DECLOP_4VAR_COMP(long4, <) +DECLOP_4VAR_COMP(long4, >) +DECLOP_4VAR_COMP(long4, <=) +DECLOP_4VAR_COMP(long4, >=) + +DECLOP_4VAR_COMP(long4, &&) +DECLOP_4VAR_COMP(long4, ||) + +DECLOP_4VAR_1IN_1OUT(long4, ~) +DECLOP_4VAR_1IN_BOOLOUT(long4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(long4, float) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(long4, double) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) + +// UNSIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(ulonglong1, +) +DECLOP_1VAR_2IN_1OUT(ulonglong1, -) +DECLOP_1VAR_2IN_1OUT(ulonglong1, *) +DECLOP_1VAR_2IN_1OUT(ulonglong1, /) +DECLOP_1VAR_2IN_1OUT(ulonglong1, %) +DECLOP_1VAR_2IN_1OUT(ulonglong1, &) +DECLOP_1VAR_2IN_1OUT(ulonglong1, |) +DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) +DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) +DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) + + +DECLOP_1VAR_ASSIGN(ulonglong1, +=) +DECLOP_1VAR_ASSIGN(ulonglong1, -=) +DECLOP_1VAR_ASSIGN(ulonglong1, *=) +DECLOP_1VAR_ASSIGN(ulonglong1, /=) +DECLOP_1VAR_ASSIGN(ulonglong1, %=) +DECLOP_1VAR_ASSIGN(ulonglong1, &=) +DECLOP_1VAR_ASSIGN(ulonglong1, |=) +DECLOP_1VAR_ASSIGN(ulonglong1, ^=) +DECLOP_1VAR_ASSIGN(ulonglong1, <<=) +DECLOP_1VAR_ASSIGN(ulonglong1, >>=) + +DECLOP_1VAR_PREOP(ulonglong1, ++) +DECLOP_1VAR_PREOP(ulonglong1, --) + +DECLOP_1VAR_POSTOP(ulonglong1, ++) +DECLOP_1VAR_POSTOP(ulonglong1, --) + +DECLOP_1VAR_COMP(ulonglong1, ==) +DECLOP_1VAR_COMP(ulonglong1, !=) +DECLOP_1VAR_COMP(ulonglong1, <) +DECLOP_1VAR_COMP(ulonglong1, >) +DECLOP_1VAR_COMP(ulonglong1, <=) +DECLOP_1VAR_COMP(ulonglong1, >=) + +DECLOP_1VAR_COMP(ulonglong1, &&) +DECLOP_1VAR_COMP(ulonglong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) + +// UNSIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(ulonglong2, +) +DECLOP_2VAR_2IN_1OUT(ulonglong2, -) +DECLOP_2VAR_2IN_1OUT(ulonglong2, *) +DECLOP_2VAR_2IN_1OUT(ulonglong2, /) +DECLOP_2VAR_2IN_1OUT(ulonglong2, %) +DECLOP_2VAR_2IN_1OUT(ulonglong2, &) +DECLOP_2VAR_2IN_1OUT(ulonglong2, |) +DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) +DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) +DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) + +DECLOP_2VAR_ASSIGN(ulonglong2, +=) +DECLOP_2VAR_ASSIGN(ulonglong2, -=) +DECLOP_2VAR_ASSIGN(ulonglong2, *=) +DECLOP_2VAR_ASSIGN(ulonglong2, /=) +DECLOP_2VAR_ASSIGN(ulonglong2, %=) +DECLOP_2VAR_ASSIGN(ulonglong2, &=) +DECLOP_2VAR_ASSIGN(ulonglong2, |=) +DECLOP_2VAR_ASSIGN(ulonglong2, ^=) +DECLOP_2VAR_ASSIGN(ulonglong2, <<=) +DECLOP_2VAR_ASSIGN(ulonglong2, >>=) + +DECLOP_2VAR_PREOP(ulonglong2, ++) +DECLOP_2VAR_PREOP(ulonglong2, --) + +DECLOP_2VAR_POSTOP(ulonglong2, ++) +DECLOP_2VAR_POSTOP(ulonglong2, --) + +DECLOP_2VAR_COMP(ulonglong2, ==) +DECLOP_2VAR_COMP(ulonglong2, !=) +DECLOP_2VAR_COMP(ulonglong2, <) +DECLOP_2VAR_COMP(ulonglong2, >) +DECLOP_2VAR_COMP(ulonglong2, <=) +DECLOP_2VAR_COMP(ulonglong2, >=) + +DECLOP_2VAR_COMP(ulonglong2, &&) +DECLOP_2VAR_COMP(ulonglong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) + +// UNSIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(ulonglong3, +) +DECLOP_3VAR_2IN_1OUT(ulonglong3, -) +DECLOP_3VAR_2IN_1OUT(ulonglong3, *) +DECLOP_3VAR_2IN_1OUT(ulonglong3, /) +DECLOP_3VAR_2IN_1OUT(ulonglong3, %) +DECLOP_3VAR_2IN_1OUT(ulonglong3, &) +DECLOP_3VAR_2IN_1OUT(ulonglong3, |) +DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) +DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) +DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) + +DECLOP_3VAR_ASSIGN(ulonglong3, +=) +DECLOP_3VAR_ASSIGN(ulonglong3, -=) +DECLOP_3VAR_ASSIGN(ulonglong3, *=) +DECLOP_3VAR_ASSIGN(ulonglong3, /=) +DECLOP_3VAR_ASSIGN(ulonglong3, %=) +DECLOP_3VAR_ASSIGN(ulonglong3, &=) +DECLOP_3VAR_ASSIGN(ulonglong3, |=) +DECLOP_3VAR_ASSIGN(ulonglong3, ^=) +DECLOP_3VAR_ASSIGN(ulonglong3, <<=) +DECLOP_3VAR_ASSIGN(ulonglong3, >>=) + +DECLOP_3VAR_PREOP(ulonglong3, ++) +DECLOP_3VAR_PREOP(ulonglong3, --) + +DECLOP_3VAR_POSTOP(ulonglong3, ++) +DECLOP_3VAR_POSTOP(ulonglong3, --) + +DECLOP_3VAR_COMP(ulonglong3, ==) +DECLOP_3VAR_COMP(ulonglong3, !=) +DECLOP_3VAR_COMP(ulonglong3, <) +DECLOP_3VAR_COMP(ulonglong3, >) +DECLOP_3VAR_COMP(ulonglong3, <=) +DECLOP_3VAR_COMP(ulonglong3, >=) + +DECLOP_3VAR_COMP(ulonglong3, &&) +DECLOP_3VAR_COMP(ulonglong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) + +// UNSIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(ulonglong4, +) +DECLOP_4VAR_2IN_1OUT(ulonglong4, -) +DECLOP_4VAR_2IN_1OUT(ulonglong4, *) +DECLOP_4VAR_2IN_1OUT(ulonglong4, /) +DECLOP_4VAR_2IN_1OUT(ulonglong4, %) +DECLOP_4VAR_2IN_1OUT(ulonglong4, &) +DECLOP_4VAR_2IN_1OUT(ulonglong4, |) +DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) +DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) +DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) + +DECLOP_4VAR_ASSIGN(ulonglong4, +=) +DECLOP_4VAR_ASSIGN(ulonglong4, -=) +DECLOP_4VAR_ASSIGN(ulonglong4, *=) +DECLOP_4VAR_ASSIGN(ulonglong4, /=) +DECLOP_4VAR_ASSIGN(ulonglong4, %=) +DECLOP_4VAR_ASSIGN(ulonglong4, &=) +DECLOP_4VAR_ASSIGN(ulonglong4, |=) +DECLOP_4VAR_ASSIGN(ulonglong4, ^=) +DECLOP_4VAR_ASSIGN(ulonglong4, <<=) +DECLOP_4VAR_ASSIGN(ulonglong4, >>=) + +DECLOP_4VAR_PREOP(ulonglong4, ++) +DECLOP_4VAR_PREOP(ulonglong4, --) + +DECLOP_4VAR_POSTOP(ulonglong4, ++) +DECLOP_4VAR_POSTOP(ulonglong4, --) + +DECLOP_4VAR_COMP(ulonglong4, ==) +DECLOP_4VAR_COMP(ulonglong4, !=) +DECLOP_4VAR_COMP(ulonglong4, <) +DECLOP_4VAR_COMP(ulonglong4, >) +DECLOP_4VAR_COMP(ulonglong4, <=) +DECLOP_4VAR_COMP(ulonglong4, >=) + +DECLOP_4VAR_COMP(ulonglong4, &&) +DECLOP_4VAR_COMP(ulonglong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) + +// SIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(longlong1, +) +DECLOP_1VAR_2IN_1OUT(longlong1, -) +DECLOP_1VAR_2IN_1OUT(longlong1, *) +DECLOP_1VAR_2IN_1OUT(longlong1, /) +DECLOP_1VAR_2IN_1OUT(longlong1, %) +DECLOP_1VAR_2IN_1OUT(longlong1, &) +DECLOP_1VAR_2IN_1OUT(longlong1, |) +DECLOP_1VAR_2IN_1OUT(longlong1, ^) +DECLOP_1VAR_2IN_1OUT(longlong1, <<) +DECLOP_1VAR_2IN_1OUT(longlong1, >>) + + +DECLOP_1VAR_ASSIGN(longlong1, +=) +DECLOP_1VAR_ASSIGN(longlong1, -=) +DECLOP_1VAR_ASSIGN(longlong1, *=) +DECLOP_1VAR_ASSIGN(longlong1, /=) +DECLOP_1VAR_ASSIGN(longlong1, %=) +DECLOP_1VAR_ASSIGN(longlong1, &=) +DECLOP_1VAR_ASSIGN(longlong1, |=) +DECLOP_1VAR_ASSIGN(longlong1, ^=) +DECLOP_1VAR_ASSIGN(longlong1, <<=) +DECLOP_1VAR_ASSIGN(longlong1, >>=) + +DECLOP_1VAR_PREOP(longlong1, ++) +DECLOP_1VAR_PREOP(longlong1, --) + +DECLOP_1VAR_POSTOP(longlong1, ++) +DECLOP_1VAR_POSTOP(longlong1, --) + +DECLOP_1VAR_COMP(longlong1, ==) +DECLOP_1VAR_COMP(longlong1, !=) +DECLOP_1VAR_COMP(longlong1, <) +DECLOP_1VAR_COMP(longlong1, >) +DECLOP_1VAR_COMP(longlong1, <=) +DECLOP_1VAR_COMP(longlong1, >=) + +DECLOP_1VAR_COMP(longlong1, &&) +DECLOP_1VAR_COMP(longlong1, ||) + +DECLOP_1VAR_1IN_1OUT(longlong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) + +// SIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(longlong2, +) +DECLOP_2VAR_2IN_1OUT(longlong2, -) +DECLOP_2VAR_2IN_1OUT(longlong2, *) +DECLOP_2VAR_2IN_1OUT(longlong2, /) +DECLOP_2VAR_2IN_1OUT(longlong2, %) +DECLOP_2VAR_2IN_1OUT(longlong2, &) +DECLOP_2VAR_2IN_1OUT(longlong2, |) +DECLOP_2VAR_2IN_1OUT(longlong2, ^) +DECLOP_2VAR_2IN_1OUT(longlong2, <<) +DECLOP_2VAR_2IN_1OUT(longlong2, >>) + +DECLOP_2VAR_ASSIGN(longlong2, +=) +DECLOP_2VAR_ASSIGN(longlong2, -=) +DECLOP_2VAR_ASSIGN(longlong2, *=) +DECLOP_2VAR_ASSIGN(longlong2, /=) +DECLOP_2VAR_ASSIGN(longlong2, %=) +DECLOP_2VAR_ASSIGN(longlong2, &=) +DECLOP_2VAR_ASSIGN(longlong2, |=) +DECLOP_2VAR_ASSIGN(longlong2, ^=) +DECLOP_2VAR_ASSIGN(longlong2, <<=) +DECLOP_2VAR_ASSIGN(longlong2, >>=) + +DECLOP_2VAR_PREOP(longlong2, ++) +DECLOP_2VAR_PREOP(longlong2, --) + +DECLOP_2VAR_POSTOP(longlong2, ++) +DECLOP_2VAR_POSTOP(longlong2, --) + +DECLOP_2VAR_COMP(longlong2, ==) +DECLOP_2VAR_COMP(longlong2, !=) +DECLOP_2VAR_COMP(longlong2, <) +DECLOP_2VAR_COMP(longlong2, >) +DECLOP_2VAR_COMP(longlong2, <=) +DECLOP_2VAR_COMP(longlong2, >=) + +DECLOP_2VAR_COMP(longlong2, &&) +DECLOP_2VAR_COMP(longlong2, ||) + +DECLOP_2VAR_1IN_1OUT(longlong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) + +// SIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(longlong3, +) +DECLOP_3VAR_2IN_1OUT(longlong3, -) +DECLOP_3VAR_2IN_1OUT(longlong3, *) +DECLOP_3VAR_2IN_1OUT(longlong3, /) +DECLOP_3VAR_2IN_1OUT(longlong3, %) +DECLOP_3VAR_2IN_1OUT(longlong3, &) +DECLOP_3VAR_2IN_1OUT(longlong3, |) +DECLOP_3VAR_2IN_1OUT(longlong3, ^) +DECLOP_3VAR_2IN_1OUT(longlong3, <<) +DECLOP_3VAR_2IN_1OUT(longlong3, >>) + +DECLOP_3VAR_ASSIGN(longlong3, +=) +DECLOP_3VAR_ASSIGN(longlong3, -=) +DECLOP_3VAR_ASSIGN(longlong3, *=) +DECLOP_3VAR_ASSIGN(longlong3, /=) +DECLOP_3VAR_ASSIGN(longlong3, %=) +DECLOP_3VAR_ASSIGN(longlong3, &=) +DECLOP_3VAR_ASSIGN(longlong3, |=) +DECLOP_3VAR_ASSIGN(longlong3, ^=) +DECLOP_3VAR_ASSIGN(longlong3, <<=) +DECLOP_3VAR_ASSIGN(longlong3, >>=) + +DECLOP_3VAR_PREOP(longlong3, ++) +DECLOP_3VAR_PREOP(longlong3, --) + +DECLOP_3VAR_POSTOP(longlong3, ++) +DECLOP_3VAR_POSTOP(longlong3, --) + +DECLOP_3VAR_COMP(longlong3, ==) +DECLOP_3VAR_COMP(longlong3, !=) +DECLOP_3VAR_COMP(longlong3, <) +DECLOP_3VAR_COMP(longlong3, >) +DECLOP_3VAR_COMP(longlong3, <=) +DECLOP_3VAR_COMP(longlong3, >=) + +DECLOP_3VAR_COMP(longlong3, &&) +DECLOP_3VAR_COMP(longlong3, ||) + +DECLOP_3VAR_1IN_1OUT(longlong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) + +// SIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(longlong4, +) +DECLOP_4VAR_2IN_1OUT(longlong4, -) +DECLOP_4VAR_2IN_1OUT(longlong4, *) +DECLOP_4VAR_2IN_1OUT(longlong4, /) +DECLOP_4VAR_2IN_1OUT(longlong4, %) +DECLOP_4VAR_2IN_1OUT(longlong4, &) +DECLOP_4VAR_2IN_1OUT(longlong4, |) +DECLOP_4VAR_2IN_1OUT(longlong4, ^) +DECLOP_4VAR_2IN_1OUT(longlong4, <<) +DECLOP_4VAR_2IN_1OUT(longlong4, >>) + +DECLOP_4VAR_ASSIGN(longlong4, +=) +DECLOP_4VAR_ASSIGN(longlong4, -=) +DECLOP_4VAR_ASSIGN(longlong4, *=) +DECLOP_4VAR_ASSIGN(longlong4, /=) +DECLOP_4VAR_ASSIGN(longlong4, %=) +DECLOP_4VAR_ASSIGN(longlong4, &=) +DECLOP_4VAR_ASSIGN(longlong4, |=) +DECLOP_4VAR_ASSIGN(longlong4, ^=) +DECLOP_4VAR_ASSIGN(longlong4, <<=) +DECLOP_4VAR_ASSIGN(longlong4, >>=) + +DECLOP_4VAR_PREOP(longlong4, ++) +DECLOP_4VAR_PREOP(longlong4, --) + +DECLOP_4VAR_POSTOP(longlong4, ++) +DECLOP_4VAR_POSTOP(longlong4, --) + +DECLOP_4VAR_COMP(longlong4, ==) +DECLOP_4VAR_COMP(longlong4, !=) +DECLOP_4VAR_COMP(longlong4, <) +DECLOP_4VAR_COMP(longlong4, >) +DECLOP_4VAR_COMP(longlong4, <=) +DECLOP_4VAR_COMP(longlong4, >=) + +DECLOP_4VAR_COMP(longlong4, &&) +DECLOP_4VAR_COMP(longlong4, ||) + +DECLOP_4VAR_1IN_1OUT(longlong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif From 270f643c9c60b1f6b45f74ef094e83706769f2c7 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:22:14 +0300 Subject: [PATCH 093/171] [HIP] [HIPIFY] [FIX] cuModuleLoadDataEx -> hipModuleLoadDataEx https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/81 1. Do not use JIT options on HCC path, call hipModuleLoadData instead. 2. NVCC path is unchanged, to call cuModuleLoadDataEx with all options. 3. Get rid of manual hipification, based on #ifdef #else for NVCC/HIP. 4. Update documentation accordingly. --- docs/markdown/hip_porting_driver_api.md | 65 +++++++++++++---------- hipify-clang/src/Cuda2Hip.cpp | 38 ++++++------- include/hip/hcc_detail/hip_runtime_api.h | 19 +++++-- include/hip/hip_runtime_api.h | 24 +++++++++ include/hip/nvcc_detail/hip_runtime_api.h | 44 +++++++++++---- src/hip_module.cpp | 6 ++- 6 files changed, 134 insertions(+), 62 deletions(-) diff --git a/docs/markdown/hip_porting_driver_api.md b/docs/markdown/hip_porting_driver_api.md index dd3b9c3e86..0912e676cc 100644 --- a/docs/markdown/hip_porting_driver_api.md +++ b/docs/markdown/hip_porting_driver_api.md @@ -98,48 +98,57 @@ HIP/HCC will push primary context to context stack when it is empty. This can ha #### Interoperation between HIP and CUDA Driver CUDA applications may want to mix CUDA driver code with HIP code (see example below). 
This table shows the type equivalence to enable this interaction. -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| hipModule | CUmodule | | -| hipFunction | CUfunction | | -| hipCtx_t | CUcontext | | -| hipDevice_t | CUdevice | | -| hipStream_t | CUstream | cudaStream_t | -| hipEvent_t | CUevent | cudaEvent_t | -| hipArray | CUarray | cudaArray | - -#### Compilation Flags -The hipModule interface does not support the `cuModuleLoadDataEx` function, which is used to control PTX compilation options. -HCC does not use PTX and does not support the same compilation options. -In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step. -Code which requires this functionally should use platform-specific coding, calling `cuModuleLoadDataEx` on the NVCC path and `hipModuleLoadData` on the hcc path. -For example: +|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| +| ---- | ---- | ---- | +| hipModule_t | CUmodule | | +| hipFunction_t | CUfunction | | +| hipCtx_t | CUcontext | | +| hipDevice_t | CUdevice | | +| hipStream_t | CUstream | cudaStream_t | +| hipEvent_t | CUevent | cudaEvent_t | +| hipArray | CUarray | cudaArray | +#### Compilation Options +The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options. +HCC does not use PTX and does not support these compilation options. +In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step. +The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HCC path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path. +For example (CUDA): ``` -hipModule module; -void *imagePtr = ... 
; // Somehow populate data pointer with code object +CUmodule module; +void *imagePtr = ...; // Somehow populate data pointer with code object -#ifdef __HIP_PLATFORM_NVCC__ -// Use CUDA driver API but write to hipModule since they are same type: const int numOptions = 1; CUJit_option options[numOptions]; void * optionValues[numOptions]; options[0] = CU_JIT_MAX_REGISTERS; -unsigned maxRegs=15; -optionValues[0] = (void*) (&maxRegs); +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); -#else // __HIP_PLATFORM_HCC__ +CUfunction k; +cuModuleGetFunction(&k, module, "myKernel"); +``` +HIP: +``` +hipModule_t module; +void *imagePtr = ...; // Somehow populate data pointer with code object -// HCC path does not support or require JIT options, so just load the module. -hipModuleLoadData(&module, imagePtr); +const int numOptions = 1; +hipJitOption options[numOptions]; +void * optionValues[numOptions]; -#endif +options[0] = hipJitOptionMaxRegisters; +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); -// Back to unified code - both paths above loaded the "module" variable. 
-hipFunction k; +// hipModuleLoadData(&module, imagePtr) will be called on HCC path, JIT options will not be used, and +// cuModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues) will be called on NVCC path +hipModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues); + +hipFunction_t k; hipModuleGetFunction(&k, module, "myKernel"); ``` diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 0c6b0f1efc..e07baab3fd 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -678,24 +678,24 @@ struct cuda2hipMap { cuda2hipRename["CU_PREFER_PTX"] = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_PREFER_BINARY"] = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjit_option/CUjit_option_enum - cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT,
API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = 
{"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjit_target_enum"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; @@ -905,7 +905,7 @@ struct cuda2hipMap { cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; // unsupported yet by HIP - cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index e1aecef1e8..34ed2ed5ce 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1915,7 +1915,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con * @brief returns device memory pointer and size of the kernel present in 
the module with symbol @p name * * @param [out] dptr - * @param [out[ bytes + * @param [out] bytes * @param [in] hmod * @param [in] name * @@ -1923,7 +1923,6 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con */ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); - /** * @brief builds module from code object which resides in host memory. Image is pointer to that location. * @@ -1934,11 +1933,23 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h */ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); +/** +* @brief builds module from code object which resides in host memory. Image is pointer to that location. Options are not used. hipModuleLoadData is called. +* +* @param [in] image +* @param [out] module +* @param [in] number of options +* @param [in] options for JIT +* @param [in] option values for JIT +* +* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory, hipErrorNotInitialized +*/ +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra * - * @param [in[ f Kernel to launch. + * @param [in] f Kernel to launch. * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. @@ -1946,7 +1957,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimY Y grid dimension specified in work-items * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. 
- * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the default stream is used with associated synchronization rules. * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index 8eae1d6a3a..fa54dda5dc 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -250,6 +250,30 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. } hipDeviceAttribute_t; +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + /** * @} */ diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 69a9b46570..01a93f7ba4 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -54,26 +54,44 @@ hipMemcpyHostToHost #define hipFilterModePoint cudaFilterModePoint //! 
Flags that can be used with hipEventCreateWithFlags: -#define hipEventDefault cudaEventDefault -#define hipEventBlockingSync cudaEventBlockingSync -#define hipEventDisableTiming cudaEventDisableTiming -#define hipEventInterprocess cudaEventInterprocess +#define hipEventDefault cudaEventDefault +#define hipEventBlockingSync cudaEventBlockingSync +#define hipEventDisableTiming cudaEventDisableTiming +#define hipEventInterprocess cudaEventInterprocess #define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ -#define hipHostMallocDefault cudaHostAllocDefault -#define hipHostMallocPortable cudaHostAllocPortable -#define hipHostMallocMapped cudaHostAllocMapped +#define hipHostMallocDefault cudaHostAllocDefault +#define hipHostMallocPortable cudaHostAllocPortable +#define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined #define hipHostRegisterPortable cudaHostRegisterPortable -#define hipHostRegisterMapped cudaHostRegisterMapped +#define hipHostRegisterMapped cudaHostRegisterMapped #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER -#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE +#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE #define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END #define hipLimitMallocHeapSize cudaLimitMallocHeapSize -#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess +#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess + +// enum CUjit_option redefines +#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS +#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK +#define hipJitOptionWallTime CU_JIT_WALL_TIME +#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER +#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER +#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES +#define 
hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL +#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT +#define hipJitOptionTarget CU_JIT_TARGET +#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY +#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO +#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE +#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO +#define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; typedef cudaStream_t hipStream_t; @@ -84,6 +102,7 @@ typedef cudaFuncCache hipFuncCache_t; typedef CUcontext hipCtx_t; typedef CUsharedconfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; +typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; @@ -894,6 +913,11 @@ inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *imag return hipCUResultTohipError(cuModuleLoadData(module, image)); } +inline static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipCUResultTohipError(cuModuleLoadDataEx(module, image, numOptions, options, optionValues)); +} + inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, diff --git a/src/hip_module.cpp b/src/hip_module.cpp index da01f23769..d364a6b519 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -525,7 +525,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, } } - hipError_t hipModuleLoadData(hipModule_t *module, const void *image) { HIP_INIT_API(module, image); @@ -575,3 +574,8 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image) } return ihipLogStatus(ret); } + +hipError_t 
hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipModuleLoadData(module, image); +} From b5a1d47e68e54bfd82df38199f436ece116992bb Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:39:09 +0300 Subject: [PATCH 094/171] [HIPIFY] [FIX] [HIPIFY] Matcher for pointer to enum var declaration is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; [TODO] 1. new CUjit_option -> new hipJitOption. Matcher for new operator is missing: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 2. Merge matchers cudaEnumDecl and cudaEnumVarPtr. --- hipify-clang/src/Cuda2Hip.cpp | 51 +++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index e07baab3fd..34d3d6e24f 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -2905,6 +2905,51 @@ private: return false; } + bool cudaEnumVarPtr(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumVarPtr = Result.Nodes.getNodeAs("cudaEnumVarPtr")) { + const Type *t = enumVarPtr->getType().getTypePtrOrNull(); + if (t) { + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + QT = enumVarPtr->getType().getUnqualifiedType(); + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { + name = name_unqualified; + } + // Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- + 
SourceManager *SM = Result.SourceManager; + TypeLoc TL = enumVarPtr->getTypeSourceInfo()->getTypeLoc(); + SourceLocation sl(TL.getUnqualifiedLoc().getLocStart()); + SourceLocation end(TL.getUnqualifiedLoc().getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [enum var ptr]."; + printHipifyMessage(*SM, sl, msg); + } + } + return true; + } + return false; + } + bool cudaTypedefVar(const MatchFinder::MatchResult &Result) { if (const VarDecl *typedefVar = Result.Nodes.getNodeAs("cudaTypedefVar")) { QualType QT = typedefVar->getType(); @@ -3185,6 +3230,7 @@ public: if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; if (cudaEnumDecl(Result)) break; + if (cudaEnumVarPtr(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3232,6 +3278,11 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(enumDecl())) .bind("cudaEnumDecl"), Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(pointsTo(enumDecl( + matchesName("cu.*|CU.*"))))) + .bind("cudaEnumVarPtr"), + Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) .bind("cudaTypedefVar"), From 6d0f58b93988a03eab45ec78bdf20a280b917288 Mon Sep 17 00:00:00 2001 
From: pensun Date: Mon, 22 May 2017 08:52:43 -0500 Subject: [PATCH 095/171] fix GGL helper header file, reorder for C++17 Change-Id: I3d9ddfe670bf7e3e8e7bd85e52cc61f48c19c213 --- include/hip/hcc_detail/helpers.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/hip/hcc_detail/helpers.hpp b/include/hip/hcc_detail/helpers.hpp index 611929766b..b5502c1efb 100644 --- a/include/hip/hcc_detail/helpers.hpp +++ b/include/hip/hcc_detail/helpers.hpp @@ -102,9 +102,6 @@ namespace hip_impl // Not callable. template struct is_callable_impl : std::false_type {}; - - template - struct is_callable : is_callable_impl {}; #else template struct is_callable_impl : std::false_type {}; @@ -114,6 +111,8 @@ namespace hip_impl F(Ts...), void_t_>> : std::true_type {}; #endif + template + struct is_callable : is_callable_impl {}; #define count_macro_args_impl_hip_(\ _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\ From 0559fc69e951b8547f9817ba1b2a9053626dea95 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 23 May 2017 09:32:19 -0500 Subject: [PATCH 096/171] fixed erfinv build error as it is implemented in hcc Change-Id: I27a512147c53f658a63fdf3e90f5e9cfac09ada8 --- src/math_functions.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/math_functions.cpp b/src/math_functions.cpp index 3472216309..f1e52c4036 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -830,16 +830,6 @@ __host__ double erfcinv(double y) return __hip_host_erfcinv(y); } -__host__ float erfinvf(float x) -{ - return __hip_host_erfinvf(x); -} - -__host__ double erfinv(double x) -{ - return __hip_host_erfinv(x); -} - __host__ double fdivide(double x, double y) { return x/y; @@ -949,7 +939,7 @@ __host__ void sincospi(double x, double *sptr, double *cptr) __host__ float normcdfinvf(float x) { - return std::sqrt(2) * erfinv(2*x-1); + return std::sqrt(2) * erfinvf(2*x-1); } __host__ double normcdfinv(double x) From 
3d973dc4da7aedc0becd13523b9ee2f4e3f3d989 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 23 May 2017 19:45:38 +0300 Subject: [PATCH 097/171] [FIX] [HIPIFY] Matcher for new operator is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/80 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; --- hipify-clang/src/Cuda2Hip.cpp | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 34d3d6e24f..dcb9c3d216 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -3095,6 +3095,35 @@ private: return false; } + bool cudaNewOperatorDecl(const MatchFinder::MatchResult &Result) { + if (const auto *newOperator = Result.Nodes.getNodeAs("cudaNewOperatorDecl")) { + const Type *t = newOperator->getType().getTypePtrOrNull(); + if (t) { + SourceManager *SM = Result.SourceManager; + TypeLoc TL = newOperator->getAllocatedTypeSourceInfo()->getTypeLoc(); + SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [new operator]."; + printHipifyMessage(*SM, sl, msg); + } + } + } + return false; + } + + bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; if (const VarDecl *sharedVar = 
Result.Nodes.getNodeAs(refName)) { @@ -3239,6 +3268,7 @@ public: if (cudaParamDecl(Result)) break; if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; + if (cudaNewOperatorDecl(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3336,6 +3366,13 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(incompleteArrayType()))) .bind("cudaSharedIncompleteArrayVar"), Callback); + // Example: + // CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + // hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; + Finder.addMatcher(cxxNewExpr(isExpansionInMainFile(), + hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaNewOperatorDecl"), + Callback); } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From 7cfe07cff4ffbc816ee99ca07316bd46c211f9a0 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:22 -0500 Subject: [PATCH 098/171] Fix trace category for hipHostMalloc --- src/hip_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index fc2ada134e..3f95cd22b4 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -245,7 +245,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), ptr, sizeBytes, flags); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; From 2b253a48b64f9c15bbef1ce37594bdba57e18307 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:54 -0500 Subject: [PATCH 099/171] Use accelerator_scope for create_marker and create_blocking_marker. As optimization when system-scope is not needed. 
--- src/hip_hcc.cpp | 4 ++-- src/math_functions.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 979a2e5028..efa05cbb93 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -352,7 +352,7 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) this->ensureHaveQueue(crit); - crit->_av.create_blocking_marker(event->_marker); + crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } // Create a marker in this stream. @@ -1490,7 +1490,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker - thisStreamCrit->_av.create_blocking_marker(dcf); + thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); } } } diff --git a/src/math_functions.cpp b/src/math_functions.cpp index f1e52c4036..151627fc73 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -942,10 +942,10 @@ __host__ float normcdfinvf(float x) return std::sqrt(2) * erfinvf(2*x-1); } -__host__ double normcdfinv(double x) -{ - return std::sqrt(2) * erfinv(2*x-1); -} +//__host__ double normcdfinv(double x) +//{ +// return std::sqrt(2) * erfinv(2*x-1); +//} __host__ float nextafterf(float x, float y) { From 236ce70e942945f55213c0000567b47ee4eb797d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:14:38 -0500 Subject: [PATCH 100/171] Expand test to cover copy followed by event sync --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 81 ++++++++++++++++--- 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 80ff7ad98d..d12b07289b 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ 
-41,8 +41,14 @@ unsigned p_count = 100; // Structure for one stream; template class Streamer { + +#define COMMAND_ADD_FORWARD 0 +#define COMMAND_ADD_REVERSE 1 +#define COMMAND_COPY 2 + + public: - Streamer(int deviceId, T *input, size_t numElements, bool reverse); + Streamer(int deviceId, T *input, size_t numElements, int commandType); ~Streamer(); void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); @@ -57,7 +63,11 @@ public: size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; + // How much does this streamer add to A[i] after running runAsyncAfter + int expectedAdd() const { return (_commandType == COMMAND_COPY) ? 0 : p_count; }; + + int _commandType; // 0=addReverse, 1=addFwd, 2=move private: T *_C_h; @@ -71,22 +81,23 @@ private: int _deviceId; size_t _numElements; - bool _reverse; size_t _mismatchCount; }; template -Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), - _reverse(reverse) + _commandType(commandType) { size_t sizeElements = numElements * sizeof(int); + //if (commandType == 0) _commandType = 1; // TODO - remove me + HIPCHECK(hipSetDevice(_deviceId)); @@ -115,6 +126,23 @@ Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : }; +template +Streamer::~Streamer() +{ + HIPCHECK(hipSetDevice(_deviceId)); + + printf ("info: ~Streamer\n"); + if (_preA_d) { + HIPCHECK(hipFree(_preA_d)); + } + HIPCHECK(hipFree(_C_d)); + HIPCHECK(hipHostFree(_C_h)); + + HIPCHECK(hipStreamDestroy(_stream)); + HIPCHECK(hipEventDestroy(_event)); +} + + template void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { @@ -134,10 +162,14 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, 
_numElements); - if (_reverse) { + if (_commandType == COMMAND_ADD_REVERSE) { hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); - } else { + } else if (_commandType == COMMAND_ADD_FORWARD) { hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else if (_commandType == COMMAND_COPY) { + HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); + } else { + assert(0); // bad command type } HIPCHECK(hipEventRecord(_event, _stream)); @@ -263,9 +295,13 @@ void checkAll(int initValue, std::vector &streamers, std::vector< } + int expected = 0; // Check in forward order so we can find first mismatch: for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + expected += streamers[i]->expectedAdd(); + + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } if (!expectPass && (mismatchCount==0)) { @@ -305,7 +341,7 @@ void sync_allDevices(int numDevices) void sync_queryAllUntilComplete(std::vector streamers) { - for (int i=0; i=0; i--) { streamers[i]->queryUntilComplete(); }; } @@ -334,8 +370,6 @@ int main(int argc, char *argv[]) - std::vector streamers; - std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); @@ -361,9 +395,13 @@ int main(int argc, char *argv[]) HIPCHECK(hipGetDeviceCount(&numDevices)); numDevices = min(2, numDevices); // multi-GPU to 2 device. + std::vector streamers; + std::vector streamersDev0; // streamers for first device. + for (int d=0; dC_d() : initArray_d, numElements, i&1 /*reverse?*/); + int command = (i%2) ? COMMAND_ADD_FORWARD : COMMAND_ADD_REVERSE; + IntStreamer * s = new IntStreamer(d, i ? 
streamers.back()->C_d() : initArray_d, numElements, command); streamers.push_back(s); if (d==0) { streamersDev0.push_back(s); @@ -371,6 +409,10 @@ int main(int argc, char *argv[]) } } + + + + // A sideband stream channel that is independent from above. // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. @@ -383,7 +425,10 @@ int main(int argc, char *argv[]) // Tests on first GPU: + // + // This test has no synchronization - make sure it mismatches so we can ensure the other tests properyl prevent the mismatch: RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); @@ -419,5 +464,19 @@ int main(int argc, char *argv[]) } + // Change Adds to copies to stimulate different case with event followign copy: + for (auto &s : streamers) { + if (s->_commandType == COMMAND_ADD_FORWARD) + s->_commandType = COMMAND_COPY; + } + + + { + printf ("test: alternating memcpy/count-reverse followed by event\n"); + RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + } + + passed(); } From d0ef9d8462d91de9ff0d495b125f099178f5c444 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:47:56 -0500 Subject: [PATCH 101/171] Remove HIP_MAX_QUEUES (replaced with HCC_MAX_QUEUES) --- src/hip_hcc.cpp | 91 +----------------------------------------- src/hip_hcc_internal.h | 12 ------ src/hip_memory.cpp | 3 -- src/hip_stream.cpp | 6 +-- 4 files changed, 4 insertions(+), 108 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 
efa05cbb93..e77c4186e8 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -64,7 +64,6 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS; std::vector g_hipLaunchBlockingKernels; int HIP_API_BLOCKING = 0; -int HIP_MAX_QUEUES = 0; int HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; @@ -267,31 +266,6 @@ ihipStream_t::~ihipStream_t() } -inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit) -{ - if (HIP_MAX_QUEUES && !streamCrit->_hasQueue) { - - // To avoid deadlock, we have to release the stream lock before acquiring context lock. - // Else we can get hung if another thread has the context lock is trying to get lock for this stream. - // We lock it again below. - streamCrit->munlock(); - - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(this->_ctx->criticalData()); - // TODO - auto needyCritPtr = this->_criticalData.mlock(); - - // Second test to ensure we still need to steal the queue - another thread may have - // snuck in here and already solved the issue. 
- if (!needyCritPtr->_hasQueue) { - needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this); - } - - streamCrit->_hasQueue = true; - } - assert(streamCrit->_hasQueue); -} - hc::hcWaitMode ihipStream_t::waitMode() const { hc::hcWaitMode waitMode = hc::hcWaitModeActive; @@ -323,13 +297,9 @@ hc::hcWaitMode ihipStream_t::waitMode() const //This signature should be used in routines that already have locked the stream mutex void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { - if (crit->_hasQueue) { - tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); + tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - crit->_av.wait(waitMode()); - } else { - tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); - } + crit->_av.wait(waitMode()); crit->_kernelCnt = 0; } @@ -350,7 +320,6 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } @@ -362,7 +331,6 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); #if USE_NO_SCOPE printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); @@ -406,7 +374,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() crit->_kernelCnt = 0; } - this->ensureHaveQueue(crit); @@ -1001,55 +968,6 @@ std::string ihipCtx_t::toString() const }; -hc::accelerator_view -ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream) -{ - - // TODO - review handling if queue can't be found. 
- while (1) { - - for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) { - if (*iter != needyStream) { - auto victimCritPtr = (*iter)->_criticalData.mtry_lock(); - if (victimCritPtr) { - // try-lock succeeded: - if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) { - - victimCritPtr->_hasQueue = false; - - tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n", - ToString(*iter).c_str(), ToString(needyStream).c_str()); - - hc::accelerator_view av = victimCritPtr->_av; - - // TODO - cleanup to remove forced setting to N - uint64_t *p = (uint64_t*)(&victimCritPtr->_av); - *p = 0; // damage the victim av so attempt to use it will fault. - - (*iter)->_criticalData.munlock(); - return av; - } - (*iter)->_criticalData.munlock(); - } - } - } - } -} - - -hc::accelerator_view -ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) -{ - if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) { - // Steal a queue from an existing stream: - hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr); - return av; - } else { - // Create a new view - return getWriteableDevice()->_acc.create_view(); - } -} - //---- @@ -1279,7 +1197,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); - READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device. Additional streams will share the specified number of queues. 0=no limit."); READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? 
©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2078,7 +1994,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform fast asynchronous copy - we know copyDevice != NULL based on check above try { - this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy); @@ -2115,7 +2030,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform slow synchronous copy: LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2170,7 +2084,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc) //--- -// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned... hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av) { HIP_INIT_API(stream, av); diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 0d080f9225..278f52dc51 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -447,7 +447,6 @@ public: ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) : _kernelCnt(0), _av(av), - _hasQueue(true), _parent(parentStream) { }; @@ -473,11 +472,6 @@ public: uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). hc::accelerator_view _av; - - // True if the stream has an allocated queue (accelerato_view) for its use: - // Always true at ihipStream creation but queue may later be stolen. - // This acts as a valid bit for the _av. 
- bool _hasQueue; private: }; @@ -544,8 +538,6 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; - void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit); - public: //--- //Public member vars - these are set at initialization and never change: @@ -792,10 +784,6 @@ public: // Functions: void locked_waitAllStreams(); void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); - // Will allocate a queue and assign it to the needyStream: - hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); - hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit); - ihipCtxCritical_t &criticalData() { return _criticalData; }; const ihipDevice_t *getDevice() const { return _device; }; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 3f95cd22b4..a3d761752e 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -948,7 +948,6 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; @@ -1000,7 +999,6 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { @@ -1053,7 +1051,6 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 34b4bc8851..b4a0740b96 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -49,7 +49,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags) // Obtain mutex access to the device critical data, release by destructor LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - auto 
istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags); + auto istream = new ihipStream_t(ctx, acc.create_view(), flags); ctxCrit->addStream(istream); *stream = istream; @@ -129,9 +129,7 @@ hipError_t hipStreamQuery(hipStream_t stream) { LockedAccessor_StreamCrit_t crit(stream->_criticalData); - if (crit->_hasQueue) { - pendingOps = crit->_av.get_pending_async_ops(); - } + pendingOps = crit->_av.get_pending_async_ops(); } From 75f691ec2fdfc424ab75cac71db453b01bd2cc73 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 24 May 2017 00:48:10 -0500 Subject: [PATCH 102/171] Add hipHostMallocCoherent, hipHostMallocNonCoherent Provide per-allocation control over coherent/non-coherent mem. These overrid the default HIP_COHERENT_HOST_ALLOC setting. --- include/hip/hcc_detail/hip_runtime_api.h | 10 ++- include/hip/nvcc_detail/hip_runtime_api.h | 2 + src/hip_hcc.cpp | 2 +- src/hip_memory.cpp | 31 +++++-- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 83 +++++++++++++++---- util/vim/hip.vim | 2 + 6 files changed, 104 insertions(+), 26 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 34ed2ed5ce..6fb7c0256e 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -111,17 +111,21 @@ enum hipLimit_t //! Flags that can be used with hipHostMalloc #define hipHostMallocDefault 0x0 -#define hipHostMallocPortable 0x1 -#define hipHostMallocMapped 0x2 +#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts. +#define hipHostMallocMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostMallocWriteCombined 0x4 +#define hipHostMallocCoherent 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. +#define hipHostMallocNonCoherent 0x80000000 ///< Allocate non-coherent memory. 
Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. + //! Flags that can be used with hipHostRegister #define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable -#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. HIP only supports one context so this is always assumed true. +#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. #define hipHostRegisterMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostRegisterIoMemory 0x4 ///< Not supported. + #define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield #define hipDeviceScheduleSpin 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may consume more power. #define hipDeviceScheduleYield 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers power and is friendlier to other threads in the system. 
diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index 01a93f7ba4..cbc7ed9f9c 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -65,6 +65,8 @@ hipMemcpyHostToHost #define hipHostMallocPortable cudaHostAllocPortable #define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined +#define hipHostMallocCoherent 0x0 +#define hipHostMallocNonCoherent 0x0 #define hipHostRegisterPortable cudaHostRegisterPortable #define hipHostRegisterMapped cudaHostRegisterMapped diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index e77c4186e8..4588f67c2d 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -74,7 +74,7 @@ int HIP_PROFILE_API= 0; std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; -int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ +int HIP_VISIBLE_DEVICES = 0; int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index a3d761752e..3ab7713afa 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -267,17 +267,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) trueFlags = hipHostMallocMapped | hipHostMallocPortable; } - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; - if (flags & ~supportedFlags) { + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent + | hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || + ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent hip_status = hipErrorInvalidValue; - } - else { + } else { auto device = 
ctx->getWriteableDevice(); - unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + + unsigned amFlags = 0; + if (flags & hipHostMallocCoherent) { + amFlags = amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags = amHostPinned; + } else { + // depends on env variables: + amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + } - *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", + *ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host", sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); if(sizeBytes && (*ptr == NULL)){ diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/tests/src/runtimeApi/memory/hipHostMalloc.cpp index d6b3b05a1d..31596b5ea5 100644 --- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -31,14 +31,19 @@ #define LEN 1024*1024 #define SIZE LEN*sizeof(float) -__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){ +__global__ void Add(float *Ad, float *Bd, float *Cd){ int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Cd[tx] = Ad[tx] + Bd[tx]; } + +__global__ void Set(int *Ad, int val){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = val; +} + int main(){ - float *A, *B, *C; - float *Ad, *Bd, *Cd; + hipDeviceProp_t prop; int device; @@ -49,26 +54,72 @@ int main(){ failed("Does support HostPinned Memory"); } - HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped)); - HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault)); - HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0)); - HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0)); + { + float *A, *B, *C; + float *Ad, *Bd, *Cd; + HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | 
hipHostMallocMapped)); + HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault)); + HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - for(int i=0;i Date: Wed, 24 May 2017 01:03:28 -0500 Subject: [PATCH 103/171] Remove HIP_NUM_KERNELS_INFLIGHT. (redundant with HCC controls) --- docs/markdown/hip_porting_guide.md | 1 - src/hip_hcc.cpp | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/docs/markdown/hip_porting_guide.md b/docs/markdown/hip_porting_guide.md index 72f6384f6d..84887fd512 100644 --- a/docs/markdown/hip_porting_guide.md +++ b/docs/markdown/hip_porting_guide.md @@ -569,7 +569,6 @@ HIP_TRACE_API = 0 : Trace each HIP API call. Print function n HIP_TRACE_API_COLOR = green : Color to use for HIP_API. None/Red/Green/Yellow/Blue/Magenta/Cyan/White HIP_PROFILE_API = 0 : Add HIP function begin/end to ATP file generated with CodeXL HIP_VISIBLE_DEVICES = 0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence -HIP_NUM_KERNELS_INFLIGHT = 128 : Number of kernels per stream ``` diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 4588f67c2d..8e4a20ad74 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -75,7 +75,6 @@ std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; int HIP_VISIBLE_DEVICES = 0; -int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; int HIP_FORCE_P2P_HOST = 0; @@ -369,13 +368,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); - if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ - this->wait(crit); - crit->_kernelCnt = 0; - } - - - return crit; } @@ -1225,8 +1217,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. 
May help stability"); READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); - // TODO - review, can we remove this? - READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); From c8632156114577404bd1279aa3903c7a022f4409 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 24 May 2017 18:25:40 +0300 Subject: [PATCH 104/171] [FIX] [HIPIFY] Add matchers for function return types. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/73 Examples (https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/trivial_copy.inl): template cudaStream_t cuda_memcpy_stream(const thrust::cpp::execution_policy &, const thrust::cuda::execution_policy &exec) template cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy &, const thrust::cpp::execution_policy &) --- hipify-clang/src/Cuda2Hip.cpp | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index dcb9c3d216..f17c3e2646 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -3123,6 +3123,33 @@ private: return false; } + bool cudaFunctionReturn(const MatchFinder::MatchResult &Result) { + if (const auto *ret = Result.Nodes.getNodeAs("cudaFunctionReturn")) { + QualType QT = ret->getReturnType(); + SourceManager *SM = Result.SourceManager; + SourceRange sr = ret->getReturnTypeSourceRange(); + SourceLocation sl = sr.getBegin(); + std::string name = QT.getAsString(); + if (QT.getTypePtr()->isEnumeralType()) { + name = QT.getTypePtr()->getAs()->getDecl()->getNameAsString(); + } + const auto found = 
N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [function return]."; + printHipifyMessage(*SM, sl, msg); + } + } + return false; + } bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; @@ -3269,6 +3296,7 @@ public: if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; if (cudaNewOperatorDecl(Result)) break; + if (cudaFunctionReturn(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3373,6 +3401,16 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) .bind("cudaNewOperatorDecl"), Callback); + // Examples: + // 1. + // cudaStream_t cuda_memcpy_stream(...) + // 2. + // template cudaMemcpyKind cuda_memcpy_kind(...) 
+ Finder.addMatcher(functionDecl(isExpansionInMainFile(), + returns(hasDeclaration(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaFunctionReturn"), + Callback); + } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From 1dce01f9bb085e6288852af5657ededd8d384dcc Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Thu, 25 May 2017 23:15:30 -0400 Subject: [PATCH 105/171] fix hip_fast_dsqrt* to call a double fp sqrt function --- src/device_util.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/device_util.cpp b/src/device_util.cpp index b730412874..bea42aba46 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -1215,20 +1215,23 @@ __device__ float __hip_fast_tanf(float x) { } // Double Precision Math +// FIXME - HCC doesn't have a fast_math version double FP sqrt +// Another issue is that these intrinsics call for a specific rounding mode; +// however, their implementation all map to the same sqrt builtin __device__ double __hip_fast_dsqrt_rd(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rn(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_ru(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rz(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ void __threadfence_system(void){ From b251d72917f9832d2062cd8def60fa84b839376a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 May 2017 10:37:03 -0500 Subject: [PATCH 106/171] Add isDefaultStream() accessor. Fix code that checked for stream==nullptr after stream had been resolved to a "true stream". 
--- src/hip_event.cpp | 19 +++++++++++++++---- src/hip_hcc_internal.h | 5 ++++- src/hip_stream.cpp | 4 ++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index fbaf5cc463..c11a47b341 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -53,14 +53,19 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihip void ihipEvent_t::setTimestamp() { + bool isReady0 = _marker.is_ready(); + bool isReady1; + int val = 0; if (_state == hipEventStatusRecorded) { // already recorded, done: return; } else { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); + isReady1 = _marker.is_ready(); if (sig) { - if (hsa_signal_load_acquire(*sig) == 0) { + val = hsa_signal_load_acquire(*sig); + if (val == 0) { if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { _timestamp = _marker.get_end_tick(); @@ -75,6 +80,10 @@ void ihipEvent_t::setTimestamp() } } } + + if (_state != hipEventStatusRecorded) { + printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + } } @@ -118,11 +127,11 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) event->_stream = stream; - if (HIP_SYNC_NULL_STREAM && stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) { // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 - // If stream == NULL, wait on all queues. + // If default stream , then wait on all queues. 
ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true, true); @@ -167,7 +176,7 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) { auto *ctx = ihipGetTlsDefaultCtx(); // TODO-HIP_SYNC_NULL_STREAM - can remove this code ctx->locked_syncDefaultStream(true, true); @@ -175,6 +184,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); + assert (event->_marker.is_ready()); + return ihipLogStatus(hipSuccess); } } else { diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 278f52dc51..94ad4f9340 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -538,10 +538,12 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + bool isDefaultStream() const { return _id == 0; }; + public: //--- //Public member vars - these are set at initialization and never change: - SeqNum_t _id; // monotonic sequence ID + SeqNum_t _id; // monotonic sequence ID. 0 is the default stream. 
unsigned _flags; @@ -560,6 +562,7 @@ private: void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes); + public: // TODO - move private // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t ihipStreamCritical_t _criticalData; diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index b4a0740b96..9f1228d6f7 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -146,7 +146,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) hipError_t e = hipSuccess; - if (stream == NULL) { + if (stream == hipStreamNull) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { @@ -198,7 +198,7 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) if (flags == NULL) { return ihipLogStatus(hipErrorInvalidValue); - } else if (stream == NULL) { + } else if (stream == hipStreamNull) { return ihipLogStatus(hipErrorInvalidResourceHandle); } else { *flags = stream->_flags; From be8d0ba644f0897ed0111a5cabbecb72fc7145d4 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 27 May 2017 15:55:07 -0500 Subject: [PATCH 107/171] Updates so hip compiles on CUDA. 
--- include/hip/hcc_detail/hip_runtime_api.h | 25 +++++++++++++++++++ include/hip/hip_runtime_api.h | 23 ----------------- tests/src/runtimeApi/stream/hipNullStream.cpp | 2 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- tests/src/test_common.h | 2 +- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 6fb7c0256e..a8db84c4f2 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -136,6 +136,31 @@ enum hipLimit_t #define hipDeviceLmemResizeToMax 0x16 +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + + /** * @warning On AMD devices and recent Nvidia devices, these hints and controls are ignored. */ diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h index fa54dda5dc..dc163d5c25 100644 --- a/include/hip/hip_runtime_api.h +++ b/include/hip/hip_runtime_api.h @@ -250,29 +250,6 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. 
} hipDeviceAttribute_t; -/* -* @brief hipJitOption -* @enum -* @ingroup Enumerations -*/ -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionNumOptions -} hipJitOption; /** * @} diff --git a/tests/src/runtimeApi/stream/hipNullStream.cpp b/tests/src/runtimeApi/stream/hipNullStream.cpp index 380979f6bc..b610315608 100644 --- a/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index d12b07289b..9bbd43828c 100644 --- a/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/tests/src/test_common.h b/tests/src/test_common.h index bb44c94745..81edca4e1e 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -249,7 +249,7 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - setDefaultData(N, A_h ? *A_h : nullptr, B_h ? *B_h : nullptr, C_h ? *C_h : nullptr); + setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? 
*C_h : NULL); } From 620eb3069121ffec946ec67e4ff4e8f351585648 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 May 2017 14:48:27 -0500 Subject: [PATCH 108/171] Cleanup hipEvent. (Intermediate checkpoint) Support hipEventDisableSystemRelease flag. Update test. Remove stray printf --- src/hip_event.cpp | 28 ++--- src/hip_hcc.cpp | 6 +- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 102 ++++++++++++++---- 3 files changed, 100 insertions(+), 36 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index c11a47b341..71f6d8ed5b 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -82,7 +82,7 @@ void ihipEvent_t::setTimestamp() } if (_state != hipEventStatusRecorded) { - printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -92,7 +92,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) hipError_t e = hipSuccess; // TODO-IPC - support hipEventInterprocess. - unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming; + unsigned supportedFlags = hipEventDefault + | hipEventBlockingSync + | hipEventDisableTiming + | hipEventDisableSystemRelease; if ((flags & ~supportedFlags) == 0) { ihipEvent_t *eh = new ihipEvent_t(flags); @@ -197,20 +200,18 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - ihipEvent_t *start_eh = start; - ihipEvent_t *stop_eh = stop; - start->setTimestamp(); stop->setTimestamp(); hipError_t status = hipSuccess; *ms = 0.0f; - if (start_eh && stop_eh) { - if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) { + if (start && stop) { + // refresh status: + if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { // Common case, we have good information for both events. 
- int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp()); + int64_t tickDiff = (stop->timestamp() - start->timestamp()); uint64_t freqHz; hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); @@ -223,13 +224,16 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) } - } else if ((start_eh->_state == hipEventStatusRecording) || - (stop_eh->_state == hipEventStatusRecording)) { + } else if ((start->_state == hipEventStatusRecording) || + (stop->_state == hipEventStatusRecording)) { + status = hipErrorNotReady; - } else if ((start_eh->_state == hipEventStatusUnitialized) || - (stop_eh->_state == hipEventStatusUnitialized)) { + } else if ((start->_state == hipEventStatusUnitialized) || + (stop->_state == hipEventStatusUnitialized)) { status = hipErrorInvalidResourceHandle; } + } else { + status = hipErrorInvalidResourceHandle; } return ihipLogStatus(status); diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 8e4a20ad74..5e13904521 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -49,7 +49,7 @@ THE SOFTWARE. // needs HCC change for hc::no_scope -#define USE_NO_SCOPE 0 +#define USE_NO_SCOPE 1 //================================================================================================= //Global variables: @@ -331,10 +331,10 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); #if USE_NO_SCOPE - printf ("create_marker, flags = %x\n", event->_flags); + //printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); #else - event->_marker = crit->_av.create_marker(); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? 
hc::accelerator_scope : hc::system_scope); #endif }; diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 31596b5ea5..0e88570e17 100644 --- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -42,6 +42,63 @@ __global__ void Set(int *Ad, int val){ Ad[tx] = val; } + +#define SYNC_EVENT 0 +#define SYNC_STREAM 1 +#define SYNC_DEVICE 2 + +std::vector syncMsg = {"event", "stream", "device"}; + +void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +{ + std::cerr << "test: CheckHostPointer " << msg + << " ptr=" << ptr + << " syncMethod=" << syncMsg[syncMethod] << "\n"; + + hipStream_t s; + hipEvent_t e; + + // Init: + HIPCHECK(hipStreamCreate(&s)); + HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + dim3 dimBlock(64,1,1); + dim3 dimGrid(numElements/dimBlock.x,1,1); + + const int expected = 13; + + // Init array to know state: + hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, ptr, -42); + HIPCHECK(hipDeviceSynchronize()); + + hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, ptr, expected); + HIPCHECK(hipEventRecord(e, s)); + + // Host waits for event : + switch (syncMethod) { + case SYNC_EVENT: + HIPCHECK(hipEventSynchronize(e)); + break; + case SYNC_STREAM: + HIPCHECK(hipStreamSynchronize(s)); + break; + case SYNC_DEVICE: + HIPCHECK(hipDeviceSynchronize()); + break; + default: + assert(0); + }; + + for (int i=0; i Date: Sat, 27 May 2017 16:01:23 -0500 Subject: [PATCH 109/171] Add event controls for release fences. Env var : HIP_EVENT_SYS_RELEASE Event allocation flags : hipEventReleaseToDevice, hipEventReleaseToSystem (remove hipEventDisableSystemRelease) Update test for new functionality. 
--- include/hip/hcc_detail/hip_runtime_api.h | 3 ++- include/hip/nvcc_detail/hip_runtime_api.h | 3 ++- src/hip_event.cpp | 11 ++++++-- src/hip_hcc.cpp | 24 ++++++++++++----- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 27 ++++++++++++------- 5 files changed, 48 insertions(+), 20 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index a8db84c4f2..6059e1e92d 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -106,7 +106,8 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. #define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. -#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. +#define hipEventReleaseToDevice 0x40000000 /// < Use a device-scope release when recording this event. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. +#define hipEventReleaseToSystem 0x80000000 /// < Use a system-scope release when recording this event. This flag is useful to make non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms. //! 
Flags that can be used with hipHostMalloc diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index cbc7ed9f9c..b09c9323c7 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,7 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess -#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ +#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */ +#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */ #define hipHostMallocDefault cudaHostAllocDefault diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 71f6d8ed5b..2c31769718 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -95,8 +95,15 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming - | hipEventDisableSystemRelease; - if ((flags & ~supportedFlags) == 0) { + | hipEventReleaseToDevice + | hipEventReleaseToSystem + ; + const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); + + const bool illegalFlags = (flags & ~supportedFlags) || // can't set any unsupported flags. 
+ (flags & releaseFlags) == releaseFlags; // can't set both + + if (!illegalFlags) { ihipEvent_t *eh = new ihipEvent_t(flags); *event = eh; diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 5e13904521..4400e4596e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -93,8 +93,11 @@ int HIP_SYNC_HOST_ALLOC = 1; // Sync on host between int HIP_SYNC_NULL_STREAM = 1; +// HIP needs to change some behavior based on HCC_OPT_FLUSH : int HCC_OPT_FLUSH = 0; +int HIP_EVENT_SYS_RELEASE=0; + @@ -330,12 +333,18 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); -#if USE_NO_SCOPE - //printf ("create_marker, flags = %x\n", event->_flags); - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); -#else - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::accelerator_scope : hc::system_scope); -#endif + auto scopeFlag = hc::accelerator_scope; + // The env var HIP_EVENT_SYS_RELEASE sets the default, + // The explicit flags override the env var (if specified) + if (event->_flags & hipEventReleaseToSystem) { + scopeFlag = hc::system_scope; + } else if (event->_flags & hipEventReleaseToDevice) { + scopeFlag = hc::accelerator_scope; + } else { + scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope; + } + + event->_marker = crit->_av.create_marker(scopeFlag); }; //============================================================================= @@ -1221,7 +1230,8 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. 
When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "When set, use agent-scope fence operations rather than system-scope fence operationsflush when possible. This flag controls both HIP and HCC behavior."); + READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, "If set, event are created with hipEventReleaseToSystem by default. If 0, events are created with hipEventReleaseToDevice by default. The defaults can be overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag when creating the event."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 0e88570e17..54073e4901 100644 --- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -49,9 +49,12 @@ __global__ void Set(int *Ad, int val){ std::vector syncMsg = {"event", "stream", "device"}; -void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + << " eventFlags = " << std::hex << eventFlags + << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") + << ((eventFlags & hipEventReleaseToSystem) ? 
" hipEventReleaseToSystem" : "") << " ptr=" << ptr << " syncMethod=" << syncMsg[syncMethod] << "\n"; @@ -60,7 +63,7 @@ void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg // Init: HIPCHECK(hipStreamCreate(&s)); - HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + HIPCHECK(hipEventCreateWithFlags(&e, eventFlags)) dim3 dimBlock(64,1,1); dim3 dimGrid(numElements/dimBlock.x,1,1); @@ -161,18 +164,24 @@ int main(){ int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocNonCoherent)); const char *ptrType = "non-coherent"; // TODO - //CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - //CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); + + // agent-scope releases don't provide host visibility, don't use them here: } - if (0) { // TODO, remove me + if (1) { int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent)); const char *ptrType = "coherent"; - CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); } From 4ff01c971f695568ae9b71aa23f53a13e8d8e502 Mon Sep 
17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 30 May 2017 15:45:22 +0530 Subject: [PATCH 110/171] Disable normcdfinvf on __host__ Change-Id: If7bfc9826a09eb9b7675ea2a417b9418759b7912 --- src/math_functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/math_functions.cpp b/src/math_functions.cpp index 151627fc73..f66f0a4312 100644 --- a/src/math_functions.cpp +++ b/src/math_functions.cpp @@ -937,10 +937,10 @@ __host__ void sincospi(double x, double *sptr, double *cptr) *cptr = std::cos(HIP_PI*x); } -__host__ float normcdfinvf(float x) -{ - return std::sqrt(2) * erfinvf(2*x-1); -} +//__host__ float normcdfinvf(float x) +//{ +// return std::sqrt(2) * erfinvf(2*x-1); +//} //__host__ double normcdfinv(double x) //{ From 3e99bc23e72553377cae84c3270c5e66a0be3b33 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 17:58:13 +0300 Subject: [PATCH 111/171] [HIPIFY] Add more CUDA Driver API 8.0.44 Data structures. --- hipify-clang/src/Cuda2Hip.cpp | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index f17c3e2646..b163c4d20a 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -205,6 +205,7 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 
cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -695,6 +696,10 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) @@ -711,6 +716,11 @@ struct cuda2hipMap { cuda2hipRename["CU_TARGET_COMPUTE_37"] = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_50"] = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_52"] = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_TARGET_COMPUTE_53"] = {"hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_60"] = {"hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_61"] = {"hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_62"] = {"hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjitInputType/CUjitInputType_enum cuda2hipRename["CUjitInputType"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjitInputType_enum"] = {"hipJitInputType", CONV_JIT, 
API_DRIVER, HIP_UNSUPPORTED}; @@ -831,6 +841,26 @@ struct cuda2hipMap { cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // Flags for ::cuStreamWaitValue32 + cuda2hipRename["CUstreamWaitValue_flags"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWaitValue_flags_enum"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WAIT_VALUE_GEQ"] = {"hipStreamWaitValueGeq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WAIT_VALUE_EQ"] = {"hipStreamWaitValueEq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + cuda2hipRename["CU_STREAM_WAIT_VALUE_AND"] = {"hipStreamWaitValueAnd", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x2 + cuda2hipRename["CU_STREAM_WAIT_VALUE_FLUSH"] = {"hipStreamWaitValueFlush", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1<<30 + // Flags for ::cuStreamWriteValue32 + cuda2hipRename["CUstreamWriteValue_flags"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWriteValue_flags_enum"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WRITE_VALUE_DEFAULT"] = {"hipStreamWriteValueDefault", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER"] = {"hipStreamWriteValueNoMemoryBarrier", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + // Flags for ::cuStreamBatchMemOp + cuda2hipRename["CUstreamBatchMemOpType"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamBatchMemOpType_enum"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_MEM_OP_WAIT_VALUE_32"] = {"hipStreamBatchMemOpWaitValue32", 
CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"] = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"] = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Init cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; From 1cc5f42e34013e69363de5d34bc13cb40c394239 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 18:29:14 +0300 Subject: [PATCH 112/171] [HIPIFY] Add the rest of the CUDA Driver API 8.0.44 Data structures. + Memory advise values + Memory Range Attributes + P2P Attributes P.S. There are no new changes in CUDA Driver API 8.0.61 Data structures since 8.0.44. --- hipify-clang/src/Cuda2Hip.cpp | 47 +++++++++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index b163c4d20a..512144f3b9 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -431,17 +431,36 @@ struct cuda2hipMap { cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = 
{"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 + cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + // CUmem_range_attribute + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + // Context flags - cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", 
CONV_DEV, API_DRIVER}; // ((void*)0x01) @@ -908,6 +927,14 @@ struct cuda2hipMap { cuda2hipRename["cuDeviceComputeCapability"] = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; From 063539308ec68e7c45353147e696b028478bf99e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 19:45:59 +0300 Subject: [PATCH 113/171] [HIPIFY] Add the rest CUDA Runtime API 8.0.44 Data structures. + sync with corresponding CUDA Driver API Data structures. P.S. There is no any new changes in CUDA Runtime API 8.0.61 Data structures since 8.0.44. 
--- hipify-clang/src/Cuda2Hip.cpp | 269 +++++++++++++++++++--------------- 1 file changed, 147 insertions(+), 122 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 512144f3b9..59d05e69f7 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -205,7 +205,6 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 - cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -325,6 +324,9 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; // 219 cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 220 [CUDA 8.0.44] + cuda2hipRename["cudaErrorNvlinkUncorrectable"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 80 [CUDA 8.0.44] + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; // 302 cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 @@ -434,21 +436,21 @@ 
struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // Memory advise values - cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 - cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemAdviseSetReadMostly = 1) + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetReadMostly = 2) + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaMemAdviseSetPreferredLocation = 3) + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE 
(cudaMemAdviseUnsetPreferredLocation = 4) + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_RUNTIME ANALOGUE (cudaMemAdviseSetAccessedBy = 5) + cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetAccessedBy = 6) // CUmem_range_attribute - cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaMemRangeAttribute) // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeReadMostly = 1) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemRangeAttributePreferredLocation = 2) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE 
(cudaMemRangeAttributeAccessedBy = 3) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -502,111 +504,111 @@ struct cuda2hipMap { cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) // unsupported yet by HIP - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = 
{"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) // Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = 
{"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, 
API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = 
{"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = 
{"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE 
(cudaDevAttrMemoryClockRate = 36) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", 
CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) // deprecated, do not use - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = 
{"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = 
{"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE 
(cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = 
{"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = 
{"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (no) - + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // 
API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // 
API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, 
API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (cudaDevAttrHostNativeAtomicSupported = 86) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 87 // API_Runtime ANALOGUE (cudaDevAttrSingleToDoublePrecisionPerfRatio = 87) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 88 // API_Runtime ANALOGUE (cudaDevAttrPageableMemoryAccess = 88) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 89 // API_Runtime ANALOGUE (cudaDevAttrConcurrentManagedAccess = 89) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 90 // API_Runtime ANALOGUE 
(cudaDevAttrComputePreemptionSupported = 90) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 91 // API_Runtime ANALOGUE (cudaDevAttrCanUseHostPointerForRegisteredMem = 91) + + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 92 // API_Runtime ANALOGUE (no) cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; @@ -929,11 +931,11 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaDeviceP2PAttr) // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaDevP2PAttrPerformanceRank = 0x01) + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) + 
cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) // Events // pointer to CUevent_st @@ -1352,12 +1354,12 @@ struct cuda2hipMap { cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 86 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86) + cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 87 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87) + 
cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 88 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88) + cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 89 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89) + cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 90 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90) + cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 91 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91) // Pointer Attributes // struct cudaPointerAttributes @@ -1375,6 +1377,13 @@ struct cuda2hipMap { cuda2hipRename["cudaDeviceGetStreamPriorityRange"] = {"hipDeviceGetStreamPriorityRange", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaSetValidDevices"] = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = 
{"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) @@ -1538,7 +1547,7 @@ struct cuda2hipMap { cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02) cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03) - + // enum cudaResourceViewFormat cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourceViewFormat) cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00) cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01) @@ -1582,6 +1591,22 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", 
CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = 
{"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) + // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From cb60763737049bdd35de7f31078a2e0e503e2941 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 30 May 2017 21:54:33 -0500 Subject: [PATCH 114/171] Set event->_stream on hipHccModuleLaunchKernel path if start/stop used Ensure _stream is always non-null in recorded events. Fixes isDefaultStream fault. --- src/hip_event.cpp | 4 +++- src/hip_hcc_internal.h | 3 ++- src/hip_module.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 2c31769718..8ef652489a 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -42,11 +42,13 @@ ihipEvent_t::ihipEvent_t(unsigned flags) // Attach to an existing completion future: -void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType) +void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, + hipStream_t stream, ihipEventType_t eventType) { _state = hipEventStatusRecording; _marker = *cf; _type = eventType; + _stream = stream; } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 94ad4f9340..b15d5a73e4 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -538,6 +538,7 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + // Before calling this function, stream must be resolved from "0" to the actual stream: bool isDefaultStream() const { return _id == 0; }; public: @@ -602,7 +603,7 @@ enum ihipEventType_t { class ihipEvent_t { public: ihipEvent_t(unsigned flags); - void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t 
eventType); + void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); void setTimestamp(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d364a6b519..2a3bfabc28 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if (startEvent) { - startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand); } if (stopEvent) { - stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + stopEvent->attachToCompletionFuture (&cf, hStream, hipEventTypeStopCommand); } From dfcba01db602b8481535b9baad7cd8caa803fd18 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 4 May 2017 13:57:01 +0530 Subject: [PATCH 115/171] Print msg for single gpu Change-Id: I2d23c73542add8973990ba96592016726994422e --- samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 990599e1cb..0f532a2f0a 100644 --- a/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -55,13 +55,9 @@ void checkPeer2PeerSupport() { int gpuCount; int canAccessPeer; - int p2pCapableDeviceCount=0; HIPCHECK(hipGetDeviceCount(&gpuCount)); - if (gpuCount < 2) - printf("Peer2Peer application requires atleast 2 gpu devices"); - for (int currentGpu=0; currentGpu Date: Thu, 11 May 2017 11:30:49 +0530 Subject: [PATCH 116/171] Add unroll and inline asm cookbook samples Change-Id: Ie5a0fbb01b7fca82959090d89299533d49e092f1 --- samples/2_Cookbook/10_inline_asm/Makefile | 35 ++++ .../2_Cookbook/10_inline_asm/inline_asm.cpp | 174 ++++++++++++++++++ samples/2_Cookbook/9_unroll/Makefile | 39 ++++ 
samples/2_Cookbook/9_unroll/unroll.cpp | 141 ++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 samples/2_Cookbook/10_inline_asm/Makefile create mode 100644 samples/2_Cookbook/10_inline_asm/inline_asm.cpp create mode 100644 samples/2_Cookbook/9_unroll/Makefile create mode 100644 samples/2_Cookbook/9_unroll/unroll.cpp diff --git a/samples/2_Cookbook/10_inline_asm/Makefile b/samples/2_Cookbook/10_inline_asm/Makefile new file mode 100644 index 0000000000..77a7699635 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/Makefile @@ -0,0 +1,35 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = inline_asm.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./inline_asm + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp new file mode 100644 index 0000000000..2b4fc3de90 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/inline_asm.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include <iostream> + +// hip header file +#include "hip/hip_runtime.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (kernel) function: copies in[y*width + x] to out[x*width + y] with an inline GCN v_mov_b32; it must return void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); +} + +// CPU reference implementation of the width x width matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i =
0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + 
hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/samples/2_Cookbook/9_unroll/Makefile b/samples/2_Cookbook/9_unroll/Makefile new file mode 100644 index 0000000000..b71f3d8353 --- /dev/null +++ b/samples/2_Cookbook/9_unroll/Makefile @@ -0,0 +1,39 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = unroll.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./unroll + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/samples/2_Cookbook/9_unroll/unroll.cpp b/samples/2_Cookbook/9_unroll/unroll.cpp new file mode 100644 index 0000000000..22f1c75e6e --- /dev/null +++ b/samples/2_Cookbook/9_unroll/unroll.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + + +#define WIDTH 4 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + float val = in[x]; + +#pragma unroll + for(int i=0;i eps ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} From 830f2b100d59d6cc58301db68d55f01981f8fbde Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 16:33:31 +0530 Subject: [PATCH 117/171] Add inline asm hip directed tests for v_add and v_mac Change-Id: Ie5ace2e42d5da89b16e040537df2bb13d3883c6d --- tests/src/kernel/inline_asm_vadd.cpp | 126 +++++++++++++++++++++++++++ tests/src/kernel/inline_asm_vmac.cpp | 125 ++++++++++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 tests/src/kernel/inline_asm_vadd.cpp create mode 100644 tests/src/kernel/inline_asm_vmac.cpp diff --git 
a/tests/src/kernel/inline_asm_vadd.cpp b/tests/src/kernel/inline_asm_vadd.cpp new file mode 100644 index 0000000000..481b606e89 --- /dev/null +++ b/tests/src/kernel/inline_asm_vadd.cpp @@ -0,0 +1,126 @@ +/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +/* HIT_START + * BUILD: %t %s + * RUN: %t + * HIT_END + */ + + +#include <iostream> + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 + +#define THREADS_PER_BLOCK_X 4 + +// Device (kernel) function: out[i] = in[i] + out[i] via inline GCN v_add_f32; it must return void +// hipLaunchParm provides the execution configuration +__global__ void vadd_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i])); +} + +// CPU reference: output[j] = input[j] + output[j] for all NUM elements +void addCPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Launching kernel from host + hipLaunchKernel(vadd_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation + addCPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { +
errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} diff --git a/tests/src/kernel/inline_asm_vmac.cpp b/tests/src/kernel/inline_asm_vmac.cpp new file mode 100644 index 0000000000..1b6941c249 --- /dev/null +++ b/tests/src/kernel/inline_asm_vmac.cpp @@ -0,0 +1,125 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include <iostream> + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 +#define THREADS_PER_BLOCK_X 4 + +const float a = 10.0f; // saxpy scale factor, shared by the kernel and the CPU reference + +// Device (kernel) function: out[i] = a*in[i] + out[i] via inline GCN v_mac_f32; it must return void +// hipLaunchParm provides the execution configuration +__global__ void vmac_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +} + +// CPU implementation of saxpy: output[j] = a*input[j] + output[j] +void CPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= a*input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Launching kernel from host + hipLaunchKernel(vmac_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation + CPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { + errors++; + } + } + if (errors!=0) {
+ printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} From 0154c97ddd61151b2b4978b83f9b9551a34b11cb Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 18:43:24 +0530 Subject: [PATCH 118/171] Add readme for inline asm and unroll cookbook samples Change-Id: I71b7a5652c3dad181c5df60ab0dd1b81d79f1bfb --- samples/2_Cookbook/10_inline_asm/Readme.md | 47 +++++++++++++++++++++ samples/2_Cookbook/9_unroll/Readme.md | 48 ++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 samples/2_Cookbook/10_inline_asm/Readme.md create mode 100644 samples/2_Cookbook/9_unroll/Readme.md diff --git a/samples/2_Cookbook/10_inline_asm/Readme.md b/samples/2_Cookbook/10_inline_asm/Readme.md new file mode 100644 index 0000000000..8c98547220 --- /dev/null +++ b/samples/2_Cookbook/10_inline_asm/Readme.md @@ -0,0 +1,47 @@ +## inline asm ### + +This tutorial is about how to use inline GCN asm in a kernel. In this tutorial, we'll explain how to do so using the simple Matrix Transpose example. + +## Introduction: + +If you want to take advantage of the extra performance benefits of writing in assembly, as well as of special GPU hardware features that are only available through assembly, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in a kernel.
+ +For more insight, please read the following blog posts by Ben Sander: +[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](http://gpuopen.com/amdgcn-assembly) +[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/) + +For more information: +[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) +[User Guide for AMDGPU Back-end](http://llvm.org/docs/AMDGPUUsage.html) + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You have chosen the best place to start. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from our very first tutorial. + +## asm() Assembler statement + +We insert GCN ISA instructions into the kernel using the asm() assembler statement. To the same source code we used for MatrixTranspose, we'll add the following: + +` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` + +## How to build and run: +Use the make command and execute it using ./inline_asm +Use hipcc to build the application, which uses hcc on AMD and nvcc on NVIDIA.
+ + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/samples/2_Cookbook/9_unroll/Readme.md b/samples/2_Cookbook/9_unroll/Readme.md new file mode 100644 index 0000000000..3c2635c0eb --- /dev/null +++ b/samples/2_Cookbook/9_unroll/Readme.md @@ -0,0 +1,48 @@ +## Using Pragma unroll ### + +In this tutorial, we'll explain how to use #pragma unroll to improve performance. + +## Introduction: + +Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop. +Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time. + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry.
You have chosen the best place to start. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +For this tutorial we will be using MatrixTranspose with the shfl operation, i.e. our 4_shfl tutorial, since it is the only example where we used loops inside the kernel. + +In this tutorial, we'll use `#pragma unroll`. To the same source code we used for MatrixTranspose, we'll add it just before the for loop as follows: + +`#pragma unroll ` +` for(int i=0;i Date: Wed, 31 May 2017 10:15:41 +0530 Subject: [PATCH 119/171] Disable rcbrt, scalbln and scalbn double precision device test Change-Id: I46bd895701c46d3592b553090cafba99e41a2e2d --- tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp index df5dad3968..f4f7ab0479 100644 --- a/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp +++ b/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp @@ -99,7 +99,7 @@ __device__ void double_precision_math_functions() normcdf(0.0); normcdfinv(1.0); pow(1.0, 0.0); - rcbrt(1.0); + //rcbrt(1.0); remainder(2.0, 1.0); // remquo(1.0, 2.0, &iX); rhypot(0.0, 1.0); @@ -109,8 +109,8 @@ __device__ void double_precision_math_functions() rnorm4d(0.0, 0.0, 0.0, 1.0); round(0.0); rsqrt(1.0); - scalbln(0.0, 1); - scalbn(0.0, 1); + //scalbln(0.0, 1); + //scalbn(0.0, 1); signbit(1.0); sin(0.0); sincos(0.0, &fX, &fY); From 5cdd1b2bf5d74dafc72a7b3f9bd8be573bef3b57 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:19 +0530 Subject: [PATCH 120/171] Disable rcbrtf, scalblnf, scalbnf in single precision device test Change-Id: I8a250a64a0cb05132d022a11d9766ced9cdf11a7 --- tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp
b/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp index 53ccd2251f..de3dec35ef 100644 --- a/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp +++ b/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp @@ -100,7 +100,7 @@ __device__ void single_precision_math_functions() normcdfinvf(1.0f); fX = 1.0f; normf(1, &fX); powf(1.0f, 0.0f); - rcbrtf(1.0f); + //rcbrtf(1.0f); remainderf(2.0f, 1.0f); //remquof(1.0f, 2.0f, &iX); rhypotf(0.0f, 1.0f); @@ -110,8 +110,8 @@ __device__ void single_precision_math_functions() fX = 1.0f; rnormf(1, &fX); roundf(0.0f); rsqrtf(1.0f); - scalblnf(0.0f, 1); - scalbnf(0.0f, 1); + //scalblnf(0.0f, 1); + //scalbnf(0.0f, 1); signbit(1.0f); sincosf(0.0f, &fX, &fY); sincospif(0.0f, &fX, &fY); From 502a74fcd678635e59f3d7f16d95c1f000e39343 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:57 +0530 Subject: [PATCH 121/171] Fix hipMemoryAllocate test for single GPU Change-Id: If121c18ab490ba125dc689ffc08a8839fd280c38 --- tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1ee5cbc9bb..34951f0a09 100644 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -36,7 +36,6 @@ void multiGpuHostAlloc(int allocDevice) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); @@ -121,10 +120,12 @@ int main(int argc, char *argv[]) { int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); multiGpuHostAlloc(0); - multiGpuHostAlloc(1); + if (numDevices > 1) + { + multiGpuHostAlloc(1); + } } passed(); From 6e99e388ea171837ae0a3c7be39b5bfd02d6f99d Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 31 May 2017 18:55:29 +0300 Subject: [PATCH 
122/171] [HIP] [HIPIFY] CUDA Driver API 8.0.44 JIT options support. --- hipify-clang/src/Cuda2Hip.cpp | 4 ++-- include/hip/hcc_detail/hip_runtime_api.h | 2 ++ include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 59d05e69f7..9c22fde573 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -718,8 +718,8 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 6059e1e92d..25eac31ec6 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -158,6 +158,8 @@ typedef enum hipJitOption { hipJitOptionLogVerbose, hipJitOptionGenerateLineInfo, hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, hipJitOptionNumOptions } hipJitOption; diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index b09c9323c7..f92523a3e3 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -94,6 +94,8 @@ hipMemcpyHostToHost #define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE 
#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO #define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT +#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE #define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; From 741eb844fe5ffb4bab7e73820ba710672763fe66 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 31 May 2017 15:19:26 -0400 Subject: [PATCH 123/171] fix atomicCAS:remove load for the return value after CAS --- src/device_util.cpp | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/device_util.cpp b/src/device_util.cpp index bea42aba46..e59a44e5ba 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "device_util.h" #include "hip/hcc_detail/device_functions.h" #include "hip/hip_runtime.h" +#include <atomic> //================================================================================================= /* @@ -923,24 +924,45 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address, } //atomicCAS() +template <typename T> +__device__ T atomicCAS_impl(T* address, T compare, T val) +{ + // the implementation assumes the atomic is lock-free and + // has the same size as the non-atomic equivalent type + static_assert(sizeof(T) == sizeof(std::atomic<T>) + , "size mismatch between atomic and non-atomic types"); + + union { + T* address; + std::atomic<T>* atomic_address; + } u; + u.address = address; + + T expected = compare; + + // strong CAS (a weak one may fail spuriously and report a swap that never happened); hcc should generate a system scope atomic CAS + std::atomic_compare_exchange_strong_explicit(u.atomic_address + , &expected, val + , std::memory_order_acq_rel + , std::memory_order_relaxed); + return expected; +} + __device__ int atomicCAS(int* address, int compare, int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned int atomicCAS(unsigned int* address, unsigned
int compare, unsigned int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val) { - hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val); - return *address; + return atomicCAS_impl(address, compare, val); } //atomicAnd() From 4a5484c616cf8dde06fd7008c7582e04b039a92b Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 1 Jun 2017 21:08:33 +0300 Subject: [PATCH 124/171] [HIPIFY] All CUDA 8.0.44 API functions update (for both Driver and Runtime APIs) 1) P2P cuDeviceGetP2PAttribute cudaDeviceGetP2PAttribute 2) Memory Mngmnt cuMemPrefetchAsync cudaMemPrefetchAsync cuMemAdvise cudaMemAdvise cuMemRangeGetAttribute cudaMemRangeGetAttribute cuMemRangeGetAttributes cudaMemRangeGetAttributes 3) Streams (Driver API only, no analogues in Runtime API) cuStreamWaitValue32 cuStreamWaitValue32 cuStreamWriteValue32 4) Texture Reference Mngmnt (Driver API only, no analogues in Runtime API) cuTexRefSetBorderColor cuTexRefGetBorderColor --- hipify-clang/src/Cuda2Hip.cpp | 59 +++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 9c22fde573..0825285b51 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -937,6 +937,8 @@ struct cuda2hipMap { cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) + 
cuda2hipRename["cuDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaDeviceGetP2PAttribute) + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; @@ -973,6 +975,9 @@ struct cuda2hipMap { // Streams // unsupported yet by HIP cuda2hipRename["cuStreamAddCallback"] = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamDestroy_v2"] = {"hipStreamDestroy", CONV_STREAM, API_DRIVER}; @@ -1014,6 +1019,11 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) + cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) + cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) + cuda2hipRename["cuMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE 
(cudaMemRangeGetAttributes) + // Texture Reference Mngmnt // Texture reference filtering modes cuda2hipRename["CUfilter_mode"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) @@ -1022,6 +1032,9 @@ struct cuda2hipMap { cuda2hipRename["CU_TR_FILTER_MODE_POINT"] = {"hipFilterModePoint", CONV_TEX, API_DRIVER}; // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"] = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaFilterModeLinear = 1) + cuda2hipRename["cuTexRefSetBorderColor"] = {"hipTexRefSetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuTexRefGetBorderColor"] = {"hipTexRefGetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + // Profiler // unsupported yet by HIP cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED}; @@ -1111,6 +1124,25 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromSymbol"] = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + 
cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + 
cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; @@ -1137,6 +1169,7 @@ struct cuda2hipMap { cuda2hipRename["cudaGetMipmappedArrayLevel"] = {"hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGetSymbolAddress"] = {"hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGetSymbolSize"] = {"hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemPrefetchAsync"] = {"hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Driver ANALOGUE (cuMemPrefetchAsync) // malloc cuda2hipRename["cudaMalloc"] = {"hipMalloc", CONV_MEM, API_RUNTIME}; @@ -1379,10 +1412,12 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (CUdevice_P2PAttribute) - cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) - cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) - cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE 
(CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // [CUDA 8.0.44] + cuda2hipRename["cudaDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (cuDeviceGetP2PAttribute) // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) @@ -1591,22 +1626,6 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - // unsupported yet by HIP [CUDA 8.0.44] - // Memory advise values - cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) - cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) - cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) - cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; 
// 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) - cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) - cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) - cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) - // CUmem_range_attribute - cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) - cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) - cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) - cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) - cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) - // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From 4d035caedfdf02340a1e8140d0e16a033e7bb87a Mon Sep 17 
00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:30:43 +0300 Subject: [PATCH 125/171] [HIPIFY] rename legacy hipify perl script and its usage to hipify-perl --- bin/hipconvertinplace-perl.sh | 10 +++++----- bin/hipexamine-perl.sh | 6 +++--- bin/{hipify => hipify-perl} | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) rename bin/{hipify => hipify-perl} (99%) diff --git a/bin/hipconvertinplace-perl.sh b/bin/hipconvertinplace-perl.sh index a8c8d6d9e8..d500cc14c6 100755 --- a/bin/hipconvertinplace-perl.sh +++ b/bin/hipconvertinplace-perl.sh @@ -1,18 +1,18 @@ #!/bin/bash -#usage : hipconvertinplace.sh [DIRNAME] [HIPIFY_OPTIONS] +#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options] -#hipify "inplace" all code files in specified directory. +#hipify "inplace" all code files in specified directory. # This can be quite handy when dealing with an existing CUDA code base since the script # preserves the existing directory structure. # For each code file, this script will: -# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then Hipify the code file. +# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file. # - If ".prehip" file exists, this is used as input to hipify. -# (this is useful for testing improvements to the hipify toolset). +# (this is useful for testing improvements to the hipify-perl toolset). 
SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/bin/hipexamine-perl.sh b/bin/hipexamine-perl.sh index 40c1bf466d..9e0b01df44 100755 --- a/bin/hipexamine-perl.sh +++ b/bin/hipexamine-perl.sh @@ -1,12 +1,12 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify.pl options] +#usage : hipexamine.sh DIRNAME [hipify-perl options] -# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files +# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/bin/hipify b/bin/hipify-perl similarity index 99% rename from bin/hipify rename to bin/hipify-perl index 4d77fad3ed..27acc5bccc 100755 --- a/bin/hipify +++ b/bin/hipify-perl @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ## -#usage hipify [OPTIONS] INPUT_FILE +#usage hipify-perl [OPTIONS] INPUT_FILE use Getopt::Long; my $warn_whitelist =""; @@ -201,7 +201,7 @@ while (@ARGV) { my %ft; clearStats(\%ft, \@statNames); my $countIncludes = 0; - my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify but counted here. + my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here. 
my $warnings = 0; my $warningsCublas = 0; my $warningsCurand = 0; From ccc4cd1a3ebc423e4485d85ffec2d7775406690f Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:33:48 +0300 Subject: [PATCH 126/171] [HIPIFY] annotation --- bin/hipexamine-perl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipexamine-perl.sh b/bin/hipexamine-perl.sh index 9e0b01df44..4e3a261aa4 100755 --- a/bin/hipexamine-perl.sh +++ b/bin/hipexamine-perl.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify-perl options] +#usage : hipexamine-perl.sh DIRNAME [hipify-perl options] # Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. From cb7c4c423c8ec1fd835f0ae4119adc3cfd61b036 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 2 Jun 2017 11:19:33 -0500 Subject: [PATCH 127/171] added half data type and vector destructors 1. Added half data types to hip_fp16.h 2. Added destructor to vector data types Change-Id: Id5ae76a663bb90a4bde2839ec79c58fbaee5072f --- include/hip/hcc_detail/hip_fp16.h | 1 + include/hip/hcc_detail/hip_vector_types.h | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index a1abce2191..b1ecc61cb0 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -28,6 +28,7 @@ THE SOFTWARE. typedef __fp16 __half; typedef __fp16 __half1 __attribute__((ext_vector_type(1))); typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 half; /* Half Arithmetic Functions diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 3c3b26c12a..9da34d9f32 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -37,38 +37,41 @@ THE SOFTWARE. 
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } +__device__ __host__ type(const type& val) : x(val.x) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ ~type() {} #define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} +__device__ __host__ type(type1 val1, type1 
val2, type1 val3) : x(val1), y(val2), z(val3) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} \ struct uchar1 { #ifdef __cplusplus From d5c161632476af9a18671393b948dfbc4635e3b6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 3 Jun 2017 17:09:19 -0500 Subject: [PATCH 128/171] Update tests, add p2p coherency test. --- src/hip_hcc.cpp | 4 +- tests/src/runtimeApi/memory/hipHostMalloc.cpp | 17 ++ .../runtimeApi/memory/p2p_copy_coherency.cpp | 170 ++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 tests/src/runtimeApi/memory/p2p_copy_coherency.cpp diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 4400e4596e..5d8846da1e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -84,6 +84,8 @@ int HIP_DENY_PEER_ACCESS = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; +// TODO - set these to 0 and 1 +int HIP_EVENT_SYS_RELEASE=1; int HIP_COHERENT_HOST_ALLOC = 0; // TODO - set to 0 once we resolve stability. 
@@ -94,9 +96,9 @@ int HIP_SYNC_HOST_ALLOC = 1; int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : +// TODO - set this to 1 int HCC_OPT_FLUSH = 0; -int HIP_EVENT_SYS_RELEASE=0; diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 54073e4901..47baf5c206 100644 --- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -52,6 +52,8 @@ std::vector syncMsg = {"event", "stream", "device"}; void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC + //<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE << " eventFlags = " << std::hex << eventFlags << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") @@ -185,6 +187,21 @@ int main(){ } + // Check defaults: + if (1) { + int *A = nullptr; + HIPCHECK(hipHostMalloc((void**)&A, sizeBytes)); + const char *ptrType = "default"; + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + } + + } diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp new file mode 100644 index 0000000000..459c0054c9 --- /dev/null +++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -0,0 +1,170 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for peer-to-peer copy coherency. +// Also serves as a template for other tests. 
+ +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * RUN: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +#include <hc_am.hpp> +#endif + +#define USE_HSA_COPY 1 + +int enablePeers(int dev0, int dev1) +{ + int canAccessPeer01, canAccessPeer10; + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer01, dev0, dev1)); + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer10, dev1, dev0)); + if (!canAccessPeer01 || !canAccessPeer10) { + return -1; + } + + HIPCHECK(hipSetDevice(dev0)); + HIPCHECK(hipDeviceEnablePeerAccess(dev1, 0/*flags*/)); + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipDeviceEnablePeerAccess(dev0, 0/*flags*/)); + + return 0; +}; + + +__global__ void +memsetIntKernel(int * ptr, int val, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + if (gid < numElements) { + ptr[gid] = val; + } +}; + + +void checkReverse(const int *ptr, int numElements, int expected) { + for (int i=numElements-1; i>=0; i--) { + if (ptr[i] != expected) { + printf ("i=%d, ptr[](%d) != expected (%d)\n", i, ptr[i], expected); + assert (ptr[i] == expected); + } + } + + printf ("test: OK\n"); +} + + +void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + int * dataGpu0, int *dataGpu1, int *dataHost, int expected) +{ + hipEvent_t e; + HIPCHECK(hipEventCreateWithFlags(&e,0)); + + printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel"); + const size_t sizeElements = numElements * sizeof(int); + + hipStream_t stepAStream = gpu0Stream; + + if (stepAIsCopy) { +#ifdef USE_HSA_COPY + HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); +#endif + } else { + assert(0); // not yet supported. 
+ } + + HIPCHECK(hipEventRecord(e, stepAStream)); + HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + + HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream)); + + HIPCHECK(hipStreamSynchronize(gpu1Stream)); + + checkReverse(dataHost, numElements, expected); +} + + +void testMultiGpu0(int dev0, int dev1, int numElements) +{ + const size_t sizeElements = numElements * sizeof(int); + + int * dataGpu0, *dataGpu1, *dataHost; + hipStream_t gpu0Stream, gpu1Stream; + const int expected = 42; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + HIPCHECK(hipSetDevice(dev0)); + + HIPCHECK(hipMalloc(&dataGpu0, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu0Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu0, expected, numElements); + HIPCHECK(hipDeviceSynchronize()); + + + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipMalloc(&dataGpu1, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu1Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu1, 0x34, numElements); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipHostMalloc(&dataHost, sizeElements)); + memset(dataHost, 13, sizeElements); + +#ifdef __HIP_PLATFORM_HCC__ + hc::am_memtracker_print(0x0); +#endif + + printf (" test: init complete\n"); + + runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected); + +}; + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + int numElements = N; + + int dev0 = 0; + int dev1 = 1; + + // TODO - only works on multi-GPU system: + if (enablePeers(dev0,dev1) == -1) { + printf ("warning : could not find peer gpus\n"); + return -1; + }; + + //testMultiGpu0(dev0, dev1, numElements); + + + + passed(); +}; From 8ce6d179830bad76e7c853d74efcfb13c53b8f61 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 4 Jun 2017 
20:18:37 -0500 Subject: [PATCH 129/171] Update tests. Fix some NVCC issues. Add hipStreamSync2, record_event tests. --- tests/src/kernel/inline_asm_vadd.cpp | 2 +- tests/src/runtimeApi/event/record_event.cpp | 149 +++++++++++++++ tests/src/runtimeApi/memory/hipHostMalloc.cpp | 3 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 132 -------------- .../src/runtimeApi/stream/hipStreamSync2.cpp | 169 ++++++++++++++++++ tests/src/test_common.h | 14 ++ 6 files changed, 335 insertions(+), 134 deletions(-) create mode 100644 tests/src/runtimeApi/event/record_event.cpp delete mode 100644 tests/src/runtimeApi/memory/hipMemoryAllocate.cpp create mode 100644 tests/src/runtimeApi/stream/hipStreamSync2.cpp diff --git a/tests/src/kernel/inline_asm_vadd.cpp b/tests/src/kernel/inline_asm_vadd.cpp index 481b606e89..7a941d31af 100644 --- a/tests/src/kernel/inline_asm_vadd.cpp +++ b/tests/src/kernel/inline_asm_vadd.cpp @@ -16,7 +16,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTI THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s + * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp new file mode 100644 index 0000000000..66027b1643 --- /dev/null +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +{ + printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i0.0f); + printf ("time=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, stop, start)); + assert (t<0.0f); + printf ("negtime=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, start, start)); + assert (t==0.0f); + HIPCHECK(hipEventElapsedTime(&t, stop, stop)); + assert (t==0.0f); + + + if (stream) { + HIPCHECK(hipStreamDestroy(stream)); + } + HIPCHECK(hipEventDestroy(start)); + HIPCHECK(hipEventDestroy(stop)); + + printf ("test: OK \n"); +} + + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("test: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone); + test (C_d, C_h, numElements, syncNullStream); + test (C_d, C_h, numElements, syncOtherStream); + //test (C_d, C_h, numElements, syncDevice); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(4000000); + + passed(); +} diff --git a/tests/src/runtimeApi/memory/hipHostMalloc.cpp 
b/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 47baf5c206..607e2a9f63 100644 --- a/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -21,11 +21,12 @@ */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN: %t * HIT_END */ +#include #include"test_common.h" #define LEN 1024*1024 diff --git a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp deleted file mode 100644 index 34951f0a09..0000000000 --- a/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * RUN: %t - * HIT_END - */ - -#include"test_common.h" - -#define NUM_ELEMENTS 1024*1024*64 -#define SIZE NUM_ELEMENTS*sizeof(int) - -int p_count = 4; - - -void multiGpuHostAlloc(int allocDevice) -{ - - int numDevices; - HIPCHECK(hipGetDeviceCount(&numDevices)); - - printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); - - - HIPCHECK(hipSetDevice(allocDevice)); - - int *Ah, *Ch; - hipHostMalloc((void**)&Ah, SIZE); - hipHostMalloc((void**)&Ch, SIZE); - - const int init = -1; - for (size_t i=0; i 1) - { - multiGpuHostAlloc(1); - } - } - - passed(); -} diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp new file mode 100644 index 0000000000..b57e120dcc --- /dev/null +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -0,0 +1,169 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, + syncMarkerThenOtherStream, + syncMarkerThenOtherNonBlockingStream, + syncDevice +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + case syncMarkerThenOtherStream: + return "syncMarkerThenOtherStream"; + case syncMarkerThenOtherNonBlockingStream: + return "syncMarkerThenOtherNonBlockingStream"; + case syncDevice: + return "syncDevice"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +{ + printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i 0); + } + + + HIPCHECK(hipStreamDestroy(otherStream)); + HIPCHECK(hipEventDestroy(e)); + + printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); +} + + +void testEventRecord() +{ +} + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone, true /*expectMismatch*/); + test (C_d, C_h, numElements, 
syncNullStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); + test (C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(40000000); + + passed(); +} diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 81edca4e1e..f585fb8bca 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -201,6 +201,20 @@ addCountReverse( const T *A_d, } +template +__global__ void +memsetReverse( T *C_d, T val, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = val; + } +} + + template void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { From 39c18e5e5ff1685ee84e1e35e28fb0231834e7e5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 5 Jun 2017 00:41:18 -0500 Subject: [PATCH 130/171] Fix HIP_SYNC_NULL_STREAM=0 mode. - Fix null-stream sync - hipStreamDestroy of null stream returns hipErrorInvalidResourceHandle - Update documentation. - Add tests for null stream sync, hipEventElapsedTime. 
- Rename internal enum hipEventStatusRecorded to hipEventStatusComplete - refactor hipStreamWaitEvent to streamline control-flow --- include/hip/hcc_detail/hip_runtime_api.h | 10 +- src/hip_event.cpp | 74 ++++----- src/hip_hcc.cpp | 56 ++++--- src/hip_hcc_internal.h | 10 +- src/hip_stream.cpp | 30 ++-- tests/src/runtimeApi/event/record_event.cpp | 141 ++++++++++++------ .../src/runtimeApi/stream/hipStreamSync2.cpp | 72 ++++++--- 7 files changed, 249 insertions(+), 144 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index 25eac31ec6..fde38c8395 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -658,10 +658,12 @@ hipError_t hipStreamSynchronize(hipStream_t stream); * * This function inserts a wait operation into the specified stream. * All future work submitted to @p stream will wait until @p event reports completion before beginning execution. - * This function is host-asynchronous and the function may return before the wait has completed. + * + * This function only waits for commands in the current stream to complete. Notably,, this function does + * not impliciy wait for commands in the default stream to complete, even if the specified stream is + * created with hipStreamNonBlocking = 0. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy - * */ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags); @@ -766,10 +768,10 @@ hipError_t hipEventCreate(hipEvent_t* event); * the specified stream, after all previous * commands in that stream have completed executing. * - * If hipEventRecord() has been previously called aon event, then this call will overwrite any existing state in event. + * If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event. 
* * If this function is called on a an event that is currently being recorded, results are undefined - either - * outstanding recording may save state into the event, and the order is not guaranteed. This shoul be avoided. + * outstanding recording may save state into the event, and the order is not guaranteed. * * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime * diff --git a/src/hip_event.cpp b/src/hip_event.cpp index 8ef652489a..ab1c43a00b 100644 --- a/src/hip_event.cpp +++ b/src/hip_event.cpp @@ -53,15 +53,12 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, -void ihipEvent_t::setTimestamp() +void ihipEvent_t::refereshEventStatus() { bool isReady0 = _marker.is_ready(); bool isReady1; int val = 0; - if (_state == hipEventStatusRecorded) { - // already recorded, done: - return; - } else { + if (_state == hipEventStatusRecording) { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); isReady1 = _marker.is_ready(); @@ -78,12 +75,12 @@ void ihipEvent_t::setTimestamp() _timestamp = 0; } - _state = hipEventStatusRecorded; + _state = hipEventStatusComplete; } } } - if (_state != hipEventStatusRecorded) { + if (_state != hipEventStatusComplete) { //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -103,12 +100,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); const bool illegalFlags = (flags & ~supportedFlags) || // can't set any unsupported flags. 
- (flags & releaseFlags) == releaseFlags; // can't set both + (flags & releaseFlags) == releaseFlags; // can't set both release flags if (!illegalFlags) { - ihipEvent_t *eh = new ihipEvent_t(flags); - - *event = eh; + *event = new ihipEvent_t(flags); } else { e = hipErrorInvalidValue; } @@ -148,7 +143,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); - event->_state = hipEventStatusRecorded; + event->_state = hipEventStatusComplete; return ihipLogStatus(hipSuccess); } else { event->_state = hipEventStatusRecording; @@ -209,41 +204,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - start->setTimestamp(); - stop->setTimestamp(); - hipError_t status = hipSuccess; + *ms = 0.0f; - if (start && stop) { - // refresh status: - if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { - // Common case, we have good information for both events. + if ((start == nullptr) || + (start->_flags & hipEventDisableTiming) || + (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) || + (stop == nullptr) || + (stop->_flags & hipEventDisableTiming) || + ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) { - int64_t tickDiff = (stop->timestamp() - start->timestamp()); + // Both events must be at least recorded else return hipErrorInvalidResourceHandle - uint64_t freqHz; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); - if (freqHz) { - *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; - status = hipSuccess; - } else { - * ms = 0.0f; - status = hipErrorInvalidValue; - } + status = hipErrorInvalidResourceHandle; + + } else { + // Refresh status, if still recording... 
+ start->refereshEventStatus(); + stop->refereshEventStatus(); + + if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) { + // Common case, we have good information for both events. + + int64_t tickDiff = (stop->timestamp() - start->timestamp()); + + uint64_t freqHz; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); + if (freqHz) { + *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; + status = hipSuccess; + } else { + * ms = 0.0f; + status = hipErrorInvalidValue; + } } else if ((start->_state == hipEventStatusRecording) || (stop->_state == hipEventStatusRecording)) { status = hipErrorNotReady; - } else if ((start->_state == hipEventStatusUnitialized) || - (stop->_state == hipEventStatusUnitialized)) { - status = hipErrorInvalidResourceHandle; + } else { + assert(0); } - } else { - status = hipErrorInvalidResourceHandle; - } + } return ihipLogStatus(status); } diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 5d8846da1e..0cdc57eaab 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -92,7 +92,8 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; -// Sync on host between +// Chicken bit to sync on host to implement null stream. +// If 0, null stream synchronization is performed on the GPU int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : @@ -987,11 +988,17 @@ std::string ihipCtx_t::toString() const -// Implement "default" stream syncronization -// This waits for all other streams to drain before continuing. +// This called for submissions that are sent to the null/default stream. This routine ensures +// that this new command waits for activity in the other streams to complete before proceeding. +// +// HIP_SYNC_NULL_STREAM=0 does all dependency resolutiokn on the GPU +// HIP_SYNC_NULL_STREAM=1 s legacy non-optimal mode which conservatively waits on host. 
+// // If waitOnSelf is set, this additionally waits for the default stream to empty. // In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other // activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// +// syncToHost causes host to wait for the stream to finish. // Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { @@ -1005,34 +1012,36 @@ void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream, unless waitOnSelf specified. + bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && + (waitOnSelf || (stream != _defaultStream)); + if (HIP_SYNC_NULL_STREAM) { - // Don't wait for streams that have "opted-out" of syncing with NULL stream. 
- // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { - - if (waitOnSelf || (stream != _defaultStream)) { - stream->locked_wait(); - } + if (waitThisStream) { + stream->locked_wait(); } } else { - if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + if (waitThisStream) { LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); // The last marker will provide appropriate visibility: if (!streamCrit->_av.get_is_empty()) { depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + tprintf(DB_SYNC, " push marker to wait for stream=%s\n", ToString(stream).c_str()); + } else { + tprintf(DB_SYNC, " skipped stream=%s since it is empty\n", ToString(stream).c_str()); } } } } - // Enqueue a barrier to wait on all the barriers we sent above: if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); - tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams. sync_host=%d\n", depOps.size(), syncHost); hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); if (syncHost) { defaultCf.wait(); // TODO - account for active or blocking here. 
@@ -1374,6 +1383,7 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { + // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams: ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); @@ -1382,34 +1392,38 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) #endif return ctx->_defaultStream; } else { - // All streams have to wait for legacy default stream to be empty: + // Submitting to a "normal" stream, just wait for null stream: if (!(stream->_flags & hipStreamNonBlocking)) { if (HIP_SYNC_NULL_STREAM) { - tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", ToString(stream).c_str()); stream->getCtx()->_defaultStream->locked_wait(); } else { ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; - tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); - bool needMarker = false; + bool needGatherMarker = false; // used to gather together other markers. hc::completion_future dcf; { LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); - // TODO - could call create_blocking_marker(queue) + // TODO - could call create_blocking_marker(queue) or uses existing marker. if (!defaultStreamCrit->_av.get_is_empty()) { - needMarker = true; + needGatherMarker = true; - // TODO - add "none_scope". 
+ tprintf(DB_SYNC, " %s adding marker to default %s for dependency\n", + ToString(stream).c_str(), ToString(defaultStream).c_str()); dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } else { + tprintf(DB_SYNC, " %s skipping marker since default stream is empty\n", ToString(stream).c_str()); } } - if (needMarker) { + if (needGatherMarker) { // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); + tprintf(DB_SYNC, " %s adding marker to wait for freshly recorded default-stream marker \n", + ToString(stream).c_str()); } } } diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index b15d5a73e4..c3f8b72311 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -586,10 +586,10 @@ private: // Data //---- // Internal event structure: enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. - hipEventStatusCreated = 1, - hipEventStatusRecording = 2, // event has been enqueued to record something. - hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. + hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use. + hipEventStatusCreated = 1, // event created, but not yet Recorded + hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet. + hipEventStatusComplete = 3, // event has been recorded - timestamps are valid. 
} ; // TODO - rename to ihip type of some kind @@ -604,7 +604,7 @@ class ihipEvent_t { public: ihipEvent_t(unsigned flags); void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); - void setTimestamp(); + void refereshEventStatus(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp index 9f1228d6f7..40aade28b9 100644 --- a/src/hip_stream.cpp +++ b/src/hip_stream.cpp @@ -93,20 +93,17 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int } else if (event->_state != hipEventStatusUnitialized) { - bool fastWait = false; - if (stream != hipStreamNull) { + + // This will user create_blocking_marker to wait on the specified queue. stream->locked_waitEvent(event); - fastWait = true; // don't use the slow host-side synchronization. - } - - if (!fastWait) { + } else { // TODO-hcc Convert to use create_blocking_marker(...) functionality. // Currently we have a super-conservative version of this - block on host, and drain the queue. // This should create a barrier packet in the target queue. + // TODO-HIP_SYNC_NULL_STREAM stream->locked_wait(); - e = hipSuccess; } } // else event not recorded, return immediately and don't create marker. 
@@ -150,6 +147,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { + // note this does not synchornize with the NULL stream: stream->locked_wait(); e = hipSuccess; } @@ -171,20 +169,18 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { - ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); + e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream } else { stream->locked_wait(); - e = hipSuccess; - } - ihipCtx_t *ctx = stream->getCtx(); + ihipCtx_t *ctx = stream->getCtx(); - if (ctx) { - ctx->locked_removeStream(stream); - delete stream; - } else { - e = hipErrorInvalidResourceHandle; + if (ctx) { + ctx->locked_removeStream(stream); + delete stream; + } else { + e = hipErrorInvalidResourceHandle; + } } return ihipLogStatus(e); diff --git a/tests/src/runtimeApi/event/record_event.cpp b/tests/src/runtimeApi/event/record_event.cpp index 66027b1643..bd8a3ada8e 100644 --- a/tests/src/runtimeApi/event/record_event.cpp +++ b/tests/src/runtimeApi/event/record_event.cpp @@ -28,8 +28,8 @@ THE SOFTWARE. 
enum SyncMode { syncNone, - syncNullStream, - syncOtherStream, + syncStream, + syncStopEvent, }; @@ -37,19 +37,23 @@ const char *syncModeString(int syncMode) { switch (syncMode) { case syncNone: return "syncNone"; - case syncNullStream: - return "syncNullStream"; - case syncOtherStream: - return "syncOtherStream"; + case syncStream: + return "syncStream"; + case syncStopEvent: + return "syncStopEvent"; default: return "unknown"; }; }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_t stream, int waitStart, SyncMode syncMode) { - printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -60,55 +64,95 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) C_h[i] = -1; // initialize } - hipStream_t stream = 0; + hipEvent_t neverCreated=0, neverRecorded, timingDisabled; + HIPCHECK(hipEventCreate(&neverRecorded)); + HIPCHECK(hipEventCreateWithFlags(&timingDisabled, hipEventDisableTiming)); - unsigned flags=0; - if (syncMode == syncOtherStream) { - HIPCHECK(hipStreamCreateWithFlags(&stream, flags)); - } - - hipEvent_t neverCreated=0; - hipEvent_t start, stop, neverRecorded; + hipEvent_t start, stop; HIPCHECK(hipEventCreate(&start)); HIPCHECK(hipEventCreate(&stop)); - HIPCHECK(hipEventCreate(&neverRecorded)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); HIPCHECK(hipEventRecord(stop, stream)); - HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to 
finish... + + if (waitStart) { + HIPCHECK(hipEventSynchronize(start)); + } + + + hipError_t expectedStopError = hipSuccess; + + // How to wait for the events to finish: + switch (syncMode) { + case syncNone: + expectedStopError = hipErrorNotReady; + break; + case syncStream: + HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... + break; + case syncStopEvent: + HIPCHECK(hipEventSynchronize(stop)); + break; + default: + assert(0); + }; + float t; - HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); - - HIPCHECK(hipEventElapsedTime(&t, start, stop)); - assert (t>0.0f); - printf ("time=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, stop, start)); - assert (t<0.0f); - printf ("negtime=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, start, start)); - assert (t==0.0f); - HIPCHECK(hipEventElapsedTime(&t, stop, stop)); - assert (t==0.0f); - - - if (stream) { - HIPCHECK(hipStreamDestroy(stream)); + hipError_t e = hipEventElapsedTime(&t, start, start); + if ((e != hipSuccess) && (e != hipErrorNotReady)) { + failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } + + if (e == hipSuccess) + assert (t==0.0f); + + + // stop usually ready unless we skipped the synchronization (syncNone) + HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); + if (e == hipSuccess) + assert (t==0.0f); + + + e = hipEventElapsedTime(&t, start, stop); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == hipSuccess) + assert (t>0.0f); + printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); + + e = hipEventElapsedTime(&t, stop, start); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == 
hipSuccess) + assert (t<0.0f); + printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); + + + + { + // Check some error conditions for incomplete events: + HIPCHECK_API(hipEventElapsedTime(&t, timingDisabled, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, timingDisabled), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); + } + HIPCHECK(hipEventDestroy(start)); HIPCHECK(hipEventDestroy(stop)); + // Clear out everything: + HIPCHECK(hipDeviceSynchronize()); + printf ("test: OK \n"); } @@ -125,15 +169,22 @@ void runTests(int64_t numElements) HIPCHECK(hipMalloc(&C_d, sizeBytes)); HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, 0x0)); - { - test (C_d, C_h, numElements, syncNone); - test (C_d, C_h, numElements, syncNullStream); - test (C_d, C_h, numElements, syncOtherStream); - //test (C_d, C_h, numElements, syncDevice); + //for (int waitStart=0; waitStart<2; waitStart++) { + for (int waitStart=1; waitStart>=0; waitStart--) { + unsigned W = waitStart ? 
0x1000:0; + test (W | 0x01, C_d, C_h, numElements, 0 , waitStart, syncNone); + test (W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone); + test (W | 0x04, C_d, C_h, numElements, 0 , waitStart, syncStream); + test (W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream); + test (W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent); + test (W | 0x20, C_d, C_h, numElements, stream, waitStart, syncStopEvent); } + HIPCHECK(hipStreamDestroy(stream)); HIPCHECK(hipFree(C_d)); HIPCHECK(hipHostFree(C_h)); } @@ -143,7 +194,7 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); - runTests(4000000); + runTests(80000000); passed(); } diff --git a/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/tests/src/runtimeApi/stream/hipStreamSync2.cpp index b57e120dcc..c6a58ce7d4 100644 --- a/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -56,9 +56,27 @@ const char *syncModeString(int syncMode) { }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + // This test sends a long-running kernel to the null stream, then tests to see if the + // specified synchronization technique is effective. + // + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // cases the test sets expectMismatch and the check logic below will attempt to ensure that + // the undesired synchronization did not occur - ie ensure the kernel is still running and did + // not yet update the stop event. This can be tricky since if the kernel runs fast enough it + // may complete before the check. 
To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. + // + // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results + // are checked and we test to make sure stop event has completed. + + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -72,13 +90,15 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); - hipEvent_t e; - HIPCHECK(hipEventCreate(&e)); + hipEvent_t stop, otherStreamEvent; + HIPCHECK(hipEventCreate(&stop)); + HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. 
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -92,7 +112,10 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - HIPCHECK(hipEventRecord(e, otherStream)); // this may wait for NULL stream depending hipStreamNonBlocking flag above + + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: @@ -102,6 +125,14 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec assert(0); }; + hipError_t done = hipEventQuery(stop); + + if (expectMismatch) { + assert (done == hipErrorNotReady); + } else { + assert (done == hipSuccess); + } + int mismatches = 0; int expected = init0 + count; for (int i=0; i Date: Mon, 5 Jun 2017 08:50:41 -0500 Subject: [PATCH 131/171] Enable HIP_SYNC_NULL_STREAM=0 optimization. --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 0cdc57eaab..08a2cdbfcf 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -94,7 +94,7 @@ int HIP_SYNC_HOST_ALLOC = 1; // Chicken bit to sync on host to implement null stream. 
// If 0, null stream synchronization is performed on the GPU -int HIP_SYNC_NULL_STREAM = 1; +int HIP_SYNC_NULL_STREAM = 0; // HIP needs to change some behavior based on HCC_OPT_FLUSH : // TODO - set this to 1 From a9808961bd14dd3c8bf076bfe72b649f7ceaa02f Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 5 Jun 2017 11:38:28 -0500 Subject: [PATCH 132/171] Improve HIP kernel names, attributes and codegen, contributed by Alex Voicu Change-Id: I2cafbdc5a98e26c7f4fad84739c915e7dc09993c --- include/hip/hcc_detail/grid_launch_GGL.hpp | 1278 ++++++++++---------- include/hip/hcc_detail/hip_runtime.h | 11 +- include/hip/hcc_detail/host_defines.h | 2 +- 3 files changed, 660 insertions(+), 631 deletions(-) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 2dd9a95bc6..8e3dab8482 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -89,8 +89,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, const hc::accelerator_view& acc_v, - K k, - Ts&&... args) + K k) { const auto d = hc::extent<3>{ num_blocks.z * dim_blocks.z, @@ -102,16 +101,11 @@ namespace hip_impl group_mem_bytes); try { - hc::parallel_for_each( - acc_v, - d, - [=](const hc::tiled_index<3>& idx) [[hc]] { - k(args...); - }); + hc::parallel_for_each(acc_v, d, k); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -133,8 +127,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... 
args) + K k) { void* lck_stream = nullptr; auto acc_v = lock_stream_hip_(stream, lck_stream); @@ -156,12 +149,11 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, acc_v, - std::move(k), - std::forward(args)...); + std::move(k)); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -175,8 +167,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -184,9 +175,7 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -199,8 +188,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -209,9 +197,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -223,8 +209,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&& ... args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -233,8 +218,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - std::forward(args)...); + std::move(k)); } template @@ -245,8 +229,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&& ... 
args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -254,610 +237,649 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - std::forward(args)...); + std::move(k)); } - namespace - { - template - constexpr - inline - T&& forward_(std::remove_reference_t& x) [[hc]] - { - return static_cast(x); - } + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n - template - struct Forwarder { - template - void operator()(Ts&&...args) const [[hc]] - { - k(forward_(args)...); - } - }; - } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + 
std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, 
p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t 
_p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + 
std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct 
make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, 
p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const 
hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + 
std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) + #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - template - requires(Domain == {Ts...}) - inline - void grid_launch( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - Forwarder{}, - std::forward(args)...); - } - template - requires(Domain == {Ts...}) - inline - void grid_launch( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... 
args) - { - grid_launch_hip_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - hipLaunchParm{}, - std::forward(args)...); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::forward(args)...); - } - - // TODO: these are temporary, they need to be completely removed once we - // enable C++14 support and can have proper generic, variadic lambdas. - #define make_kernel_lambda_hip_26(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, p23, p24)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_,\ - const std::decay_t& _p24_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - } - #define make_kernel_lambda_hip_25(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, 
p23)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - } - #define make_kernel_lambda_hip_24(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - } - #define make_kernel_lambda_hip_23(\ - 
kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - } - #define make_kernel_lambda_hip_22(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - } - #define make_kernel_lambda_hip_21(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, 
p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - } - #define make_kernel_lambda_hip_20(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - } - #define make_kernel_lambda_hip_19(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const 
std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - } - #define make_kernel_lambda_hip_18(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - } - #define make_kernel_lambda_hip_17(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - 
_p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - } - #define make_kernel_lambda_hip_16(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_);\ - } - #define make_kernel_lambda_hip_15(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_);\ - } - #define make_kernel_lambda_hip_14(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, 
_p12_);\ - } - #define make_kernel_lambda_hip_13(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_);\ - } - #define make_kernel_lambda_hip_12(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - } - #define make_kernel_lambda_hip_11(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_);\ - } - #define make_kernel_lambda_hip_10(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, 
_p6_, _p7_, _p8_);\ - } - #define make_kernel_lambda_hip_9(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - } - #define make_kernel_lambda_hip_8(kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - } - #define make_kernel_lambda_hip_7(kernel_name, p0, p1, p2, p3, p4, p5)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - } - #define make_kernel_lambda_hip_6(kernel_name, p0, p1, p2, p3, p4)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - } - #define make_kernel_lambda_hip_5(kernel_name, p0, p1, p2, p3)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - } - #define make_kernel_lambda_hip_4(kernel_name, p0, p1, p2)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_);\ - } - #define make_kernel_lambda_hip_3(kernel_name, p0, p1)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_) [[hc]] {\ - kernel_name(_p0_, _p1_);\ - } - #define 
make_kernel_lambda_hip_2(kernel_name, p0)\ - [](const std::decay_t& _p0_) [[hc]] {\ - kernel_name(_p0_);\ - } - #define make_kernel_lambda_hip_1(kernel_name)\ - []() [[hc]] { return kernel_name(hipLaunchParm{}); } - - #define make_kernel_lambda_hip_(...)\ - overload_macro_hip_(make_kernel_lambda_hip_, __VA_ARGS__) + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) #define hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - make_kernel_lambda_hip_(kernel_name, __VA_ARGS__),\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) #define hipLaunchKernel(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) } #endif //GENERIC_GRID_LAUNCH diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 4d8876d8f4..129020d9cd 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ 
b/include/hip/hcc_detail/hip_runtime.h @@ -149,8 +149,15 @@ extern int HIP_TRACE_API; #endif /* Device feature flags */ -//TODO-HCC this is currently ignored by HCC target of HIP -#define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) +#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) +#define launch_bounds_impl1(\ + requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\ + amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) +#define select_impl_(_1, _2, impl_, ...) impl_ +#define __launch_bounds__(...) select_impl_(\ + __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) // Detect if we are compiling C++ mode or C mode #if defined(__cplusplus) diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index 5864cfa0e7..140cbb0678 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -48,7 +48,7 @@ THE SOFTWARE. #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else //#warning "GGL global define reached" -#define __global__ __attribute__((hc, weak)) +#define __global__ __attribute__((annotate("hip__global__"), hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From ac634bf33428d8fe844f5e8009e706cfd1612a39 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 7 Jun 2017 00:15:05 -0500 Subject: [PATCH 133/171] Enable HCC_OPT_FLUSH=1. 
Requires appropriate HCC with this support : commit 38e392b517a46a09a3b1c8f388e6a0db3741c510 --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 08a2cdbfcf..d826a0cec3 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -98,7 +98,7 @@ int HIP_SYNC_NULL_STREAM = 0; // HIP needs to change some behavior based on HCC_OPT_FLUSH : // TODO - set this to 1 -int HCC_OPT_FLUSH = 0; +int HCC_OPT_FLUSH = 1; From 1efb6ce994e50015f0b1f5352c846259f6314bf2 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:23:37 +0530 Subject: [PATCH 134/171] p2p_copy_coherency test: gracefully handle single gpu case Change-Id: I216663f67ef58c673136332635dab8b57079b909 --- tests/src/runtimeApi/memory/p2p_copy_coherency.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 459c0054c9..6bc6235454 100644 --- a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -156,7 +156,13 @@ int main(int argc, char *argv[]) int dev0 = 0; int dev1 = 1; - // TODO - only works on multi-GPU system: + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + if (numDevices == 1) { + printf("warning : test requires atleast two gpus\n"); + passed(); + } + if (enablePeers(dev0,dev1) == -1) { printf ("warning : could not find peer gpus\n"); return -1; From e6cafbf34207ce8f3fb7d6eb60459dae44b01f5c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:24:44 +0530 Subject: [PATCH 135/171] hipDeviceMemcpy test: make it functional on nvcc path Change-Id: Id10c79b48747ed701adbd0a233c53cd60cfa743b --- tests/src/deviceLib/hipDeviceMemcpy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/src/deviceLib/hipDeviceMemcpy.cpp b/tests/src/deviceLib/hipDeviceMemcpy.cpp index 3843c07bb9..527df9bab1 100644 
--- a/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -4,7 +4,7 @@ #include "../test_common.h" -#define LEN 1030 +#define LEN 1024 #define SIZE LEN << 2 /* HIT_START @@ -17,13 +17,13 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) { int tx = hipThreadIdx_x; - memcpy(Out + tx, In + tx, SIZE/LEN); + memcpy(Out + tx, In + tx, sizeof(uint32_t)); } __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { int tx = hipThreadIdx_x; - memset(ptr + tx, val, size); + memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN))); } int main() From 1c93d8592e7b548cd56ab2b2026721d55423cc50 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:25:54 +0530 Subject: [PATCH 136/171] hipMemcpy-size test: reduce max size to make it work correctly on nvcc path Change-Id: I9ce9f5a9e141ffd8ddf961269010b33358e02771 --- tests/src/runtimeApi/memory/hipMemcpy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpy.cpp b/tests/src/runtimeApi/memory/hipMemcpy.cpp index 749ec0de77..e8e803e44c 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -304,7 +304,7 @@ void memcpytest2_sizes(size_t maxElem=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = free/sizeof(T)/5; + maxElem = free/sizeof(T)/8; } printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", From 6b768c2f0abb95ab31e1a758c2c341eb7547a4ab Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:50:28 +0530 Subject: [PATCH 137/171] hip_hcc package: add libstdc++-static as a rpm dependency Change-Id: I83a79353492a6be3d788b7c0ce4a8f3aa740d9d9 --- packaging/hip_hcc.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/hip_hcc.txt b/packaging/hip_hcc.txt index b0808aa0bc..284d97e2e5 100644 --- a/packaging/hip_hcc.txt +++ 
b/packaging/hip_hcc.txt @@ -42,9 +42,9 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, libstdc++-static") else() - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, libstdc++-static") endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") From 99e9c7cca50a01745f71eeb3869ce2e51a23992f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 7 Jun 2017 09:05:30 -0500 Subject: [PATCH 138/171] Use amHostCoherentFlag. Requires new HCC version. --- src/hip_memory.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 3ab7713afa..c04c2611c3 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -243,6 +243,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } + hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags); @@ -289,10 +290,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; } else if (flags & hipHostMallocNonCoherent) { - amFlags = amHostPinned; + amFlags = amHostNonCoherent; } else { // depends on env variables: - amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + amFlags = HIP_COHERENT_HOST_ALLOC ? 
amHostCoherent : amHostNonCoherent; } From 06816fb68b446c31300d85758302c9d0bb7dd425 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Thu, 8 Jun 2017 19:20:10 -0500 Subject: [PATCH 139/171] Add clang version guard so the hip_fp16.h header won't be picked up by gcc Change-Id: Ia21335a455bc93210901b44bc8c76a7f4a385b55 --- include/hip/hcc_detail/hip_fp16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h index b1ecc61cb0..4d90ec82b2 100644 --- a/include/hip/hcc_detail/hip_fp16.h +++ b/include/hip/hcc_detail/hip_fp16.h @@ -24,7 +24,7 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H #include "hip/hcc_detail/hip_vector_types.h" - +#if ( __clang_major__ > 3) typedef __fp16 __half; typedef __fp16 __half1 __attribute__((ext_vector_type(1))); typedef __fp16 __half2 __attribute__((ext_vector_type(2))); @@ -454,6 +454,6 @@ __device__ static inline __half2 h2trunc(const __half2 h) { a.xy = __hip_hc_ir_h2trunc_int(h.xy); return a; } - +#endif //clang_major > 3 #endif From 43df5ba6604eac6c19d37139c3d8761406fb745d Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Thu, 8 Jun 2017 19:24:22 -0500 Subject: [PATCH 140/171] Fix error related to undefined reference of __get_dynamicgroupbaseptr(). 
Change-Id: I14951e1725e35dd5f5e53805f81cdb58661f59f2 --- include/hip/hcc_detail/hip_runtime.h | 8 ++++---- src/device_util.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 129020d9cd..95826f9b60 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -305,7 +305,7 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); -__device__ ADDRESS_SPACE_3 void* __get_dynamicgroupbaseptr(); +__device__ void* __get_dynamicgroupbaseptr(); /** @@ -464,10 +464,10 @@ do {\ // Macro to replace extern __shared__ declarations // to local variable definitions #define HIP_DYNAMIC_SHARED(type, var) \ - ADDRESS_SPACE_3 type* var = \ - (ADDRESS_SPACE_3 type*)__get_dynamicgroupbaseptr(); \ + type* var = \ + (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE ADDRESS_SPACE_3 +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/src/device_util.cpp b/src/device_util.cpp index e59a44e5ba..062372f0f4 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -1101,11 +1101,13 @@ __host__ __device__ int max(int arg1, int arg2) return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } -__device__ ADDRESS_SPACE_3 void* __get_dynamicgroupbaseptr() -{ +__device__ void* __get_dynamicgroupbaseptr() { return hc::get_dynamic_group_segment_base_pointer(); } +__host__ void* __get_dynamicgroupbaseptr() { + return nullptr; +} // Precise Math Functions __device__ float __hip_precise_cosf(float x) { From 5339320485d79b676f8b4b65a71f3995c2ba4530 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 09:57:17 +0530 Subject: [PATCH 141/171] Initial implementation of hipify-cmakefile Change-Id: Id365da9f887b5c3409639f000b430d093fd4f6b3 --- bin/hipify-cmakefile | 279 
+++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100755 bin/hipify-cmakefile diff --git a/bin/hipify-cmakefile b/bin/hipify-cmakefile new file mode 100755 index 0000000000..b11de4adc1 --- /dev/null +++ b/bin/hipify-cmakefile @@ -0,0 +1,279 @@ +#!/usr/bin/perl -w +## +# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +## +#usage hipify-cmakefile [OPTIONS] INPUT_FILE +use Getopt::Long; + +GetOptions( + "print-stats" => \$print_stats # print the command-line, like a header. + , "quiet-warnings" => \$quiet_warnings # don't print warnings on unknown CUDA functions. + , "no-output" => \$no_output # don't write any translated output to stdout. + , "inplace" => \$inplace # modify input file inplace, save backup in ".prehip" file. + , "n" => \$n # combination of print_stats + no-output. 
+); + +$print_stats = 1 if $n; +$no_output = 1 if $n; + +@warn_whitelist = (); + +#--- +#Stats tracking code: +@statNames = ( "macro", "include", "option", "other" ); + +#--- +#Compute total of all individual counts: +sub totalStats { + my %count = %{ shift() }; + + my $total = 0; + foreach $key ( keys %count ) { + $total += $count{$key}; + } + + return $total; +} + +#--- +sub printStats { + my $label = shift(); + my @statNames = @{ shift() }; + my %counts = %{ shift() }; + my $warnings = shift(); + my $loc = shift(); + + my $total = totalStats( \%counts ); + + printf STDERR "%s %d CUDA->HIP refs( ", $label, $total; + + foreach $stat (@statNames) { + printf STDERR "%s:%d ", $stat, $counts{$stat}; + } + + printf STDERR ") warn:%d LOC:%d", $warnings, $loc; +} + +#--- +# Add adder stats to dest. Used to add stats for current file to a running total for all files: +sub addStats { + my $dest_ref = shift(); + my %adder = %{ shift() }; + + foreach $key ( keys %adder ) { + $dest_ref->{$key} += $adder{$key}; + } +} + +#--- +sub clearStats { + my $dest_ref = shift(); + my @statNames = @{ shift() }; + + foreach $stat (@statNames) { + $dest_ref->{$stat} = 0; + } +} + +# count of transforms in all files: +my %tt; +clearStats( \%tt, \@statNames ); + +my $fileCount = @ARGV; +my $fileName = ""; + +while (@ARGV) { + $fileName = shift(@ARGV); + if ($inplace) { + my $file_prehip = "$fileName" . 
".prehip"; + my $infile; + my $outfile; + if ( -e $file_prehip ) { + $infile = $file_prehip; + $outfile = $fileName; + } + else { + system("cp $fileName $file_prehip"); + $infile = $file_prehip; + $outfile = $fileName; + } + open( INFILE, "<", $infile ) or die "error: could not open $infile"; + open( OUTFILE, ">", $outfile ) or die "error: could not open $outfile"; + $OUTFILE = OUTFILE; + } + else { + open( INFILE, "<", $fileName ) or die "error: could not open $fileName"; + $OUTFILE = STDOUT; + } + + # count of transforms in this file, init to 0 here: + my %ft; + clearStats( \%ft, \@statNames ); + + my $lineCount = 0; + + undef $/; # Read whole file at once, so we can match newlines. + while () { + + # Replace find_package(CUDA) with find_package(HIP) + $ft{'include'} += s/\bfind_package[ ]*\([ ]*CUDA[ ]*[0-9.]*/find_package(HIP/ig; + + # Replace macros + $ft{'macro'} += s/\bCUDA_ADD_EXECUTABLE/HIP_ADD_EXECUTABLE/ig; + $ft{'macro'} += s/\bCUDA_ADD_LIBRARY/HIP_ADD_LIBRARY/ig; + $ft{'macro'} += s/\bCUDA_INCLUDE_DIRECTORIES/HIP_INCLUDE_DIRECTORIES/ig; + + # Replace options + $ft{'option'} += s/\bCUDA_NVCC_FLAGS/HIP_NVCC_FLAGS/ig; + $ft{'option'} += s/\bCUDA_HOST_COMPILATION_CPP/HIP_HOST_COMPILATION_CPP/ig; + $ft{'option'} += s/\bCUDA_SOURCE_PROPERTY_FORMAT/HIP_SOURCE_PROPERTY_FORMAT/ig; + + # Replace variables + $ft{'other'} += s/\bCUDA_FOUND/HIP_FOUND/ig; + $ft{'other'} += s/\bCUDA_VERSION/HIP_VERSION/ig; + $ft{'other'} += s/\bCUDA_TOOLKIT_ROOT_DIR/HIP_ROOT_DIR/ig; + + unless ($quiet_warnings) { + + #print STDERR "Check WARNINGs\n"; + # copy into array of lines, process line-by-line to show warnings: + my @lines = split /\n/, $_; + my $tmp = $_; # copies the whole file, could be a little smarter here... 
+ my $line_num = 0; + + foreach (@lines) { + $line_num++; + + # remove any whitelisted words: + foreach $w (@warn_whitelist) { + s/\b$w\b/ZAP/; + } + + $s = warnUnsupportedSpecialFunctions($line_num); + $warnings += $s; + } + + $_ = $tmp; + } + + #-------- + # Print it! + unless ($no_output) { + print $OUTFILE "$_"; + } + $lineCount = $_ =~ tr/\n//; + } + + my $totalConverted = totalStats( \%ft ); + + if ( ( $totalConverted + $warnings ) and $print_stats ) { + printStats( "info: converted", \@statNames, \%ft, $warnings, $lineCount ); + print STDERR " in '$fileName'\n"; + print STDERR "You may need to hand-edit '$fileName' to add steps to build correctly on HCC path\n"; + } + + # Update totals for all files: + addStats( \%tt, \%ft ); + $Twarnings += $warnings; + $TlineCount += $lineCount; +} + +#-- Print total stats for all files processed: +if ( $print_stats and ( $fileCount > 1 ) ) { + print STDERR "\n"; + printStats( "info: TOTAL-converted", \@statNames, \%tt, $Twarnings, $TlineCount ); + print STDERR "\n"; +} + +#--- +sub warnUnsupportedSpecialFunctions { + my $line_num = shift; + my $m = 0; + + foreach $func ( + # macros: + "CUDA_ADD_CUFFT_TO_TARGET", + "CUDA_ADD_CUBLAS_TO_TARGET", + #"CUDA_ADD_EXECUTABLE", + #"CUDA_ADD_LIBRARY", + "CUDA_BUILD_CLEAN_TARGET", + "CUDA_COMPILE", + "CUDA_COMPILE_PTX", + "CUDA_COMPILE_FATBIN", + "CUDA_COMPILE_CUBIN", + "CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME", + #"CUDA_INCLUDE_DIRECTORIES", + "CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS", + "CUDA_SELECT_NVCC_ARCH_FLAGS", + "CUDA_WRAP_SRCS", + + # options: + "CUDA_64_BIT_DEVICE_CODE", + "CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE", + "CUDA_BUILD_CUBIN", + "CUDA_BUILD_EMULATION", + "CUDA_LINK_LIBRARIES_KEYWORD", + "CUDA_GENERATED_OUTPUT_DIR", + #"CUDA_HOST_COMPILATION_CPP", + "CUDA_HOST_COMPILER", + #"CUDA_NVCC_FLAGS", + #"CUDA_NVCC_FLAGS_", + "CUDA_PROPAGATE_HOST_FLAGS", + "CUDA_SEPARABLE_COMPILATION", + #"CUDA_SOURCE_PROPERTY_FORMAT", + "CUDA_USE_STATIC_CUDA_RUNTIME", + 
"CUDA_VERBOSE_BUILD", + + # others: + #"CUDA_VERSION_MAJOR", + #"CUDA_VERSION_MINOR", + #"CUDA_VERSION", + #"CUDA_VERSION_STRING", + "CUDA_HAS_FP16", + #"CUDA_TOOLKIT_ROOT_DIR", + "CUDA_SDK_ROOT_DIR", + "CUDA_INCLUDE_DIRS", + "CUDA_LIBRARIES", + "CUDA_CUFFT_LIBRARIES", + "CUDA_CUBLAS_LIBRARIES", + "CUDA_cudart_static_LIBRARY", + "CUDA_cudadevrt_LIBRARY", + "CUDA_cupti_LIBRARY", + "CUDA_curand_LIBRARY", + "CUDA_cusolver_LIBRARY", + "CUDA_cusparse_LIBRARY", + "CUDA_npp_LIBRARY", + "CUDA_nppc_LIBRARY", + "CUDA_nppi_LIBRARY", + "CUDA_npps_LIBRARY", + "CUDA_nvcuvenc_LIBRARY", + "CUDA_nvcuvid_LIBRARY" + ) + { + my $mt = m/\b($func)/g; + if ($mt) { + $m += $mt; + print STDERR " warning: $fileName:#$line_num : unsupported macro/option : $_\n"; + } + } + + return $m; +} From 5dfe207eb94bba66f62a6fba587c14defba07774 Mon Sep 17 00:00:00 2001 From: Patrick Flick Date: Sun, 4 Jun 2017 10:24:00 -0400 Subject: [PATCH 142/171] fix typo --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d04d63714f..565fd6a36d 100644 --- a/README.md +++ b/README.md @@ -134,7 +134,7 @@ The README with the procedures and tips the team used during this porting effort * **bin**: Tools and scripts to help with hip porting * **hipify** : Tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins. - * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc ill call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries. + * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries. 
* **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc) * **hipexamine.sh** : Script to scan directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported) From b850a08d99184610f930e98d2370bc7a4c89f253 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 11:19:55 +0530 Subject: [PATCH 143/171] Update directed tests README.md Change-Id: I395245454d376508f04e5a4a62c8933895cb3867 --- tests/README.md | 86 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/tests/README.md b/tests/README.md index cb41cc10cd..27cde7c534 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,39 +1,78 @@ # HIP testing environment. -This document explains how to use the HIP CMAKE testing environment. +This document explains how to use the HIP CMAKE testing environment. +We make use of the HIP Integrated Tester (HIT) framework to automatically find and add test cases to the CMAKE testing environment. ### Quickstart -Usage : + +HIP unit tests are integrated into the top-level cmake project. The tests depend upon the installed version of HIP. +Typical usage (paths relative to top of the HIP repo): ``` $ mkdir build $ cd build -$ cmake ../src +$ cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/install $ make +$ make install +$ make build_tests $ make test ``` ### How to add a new test -The tests/src/runtimeApi/memory/hipMemtest.cpp file contains a simple unit test and is a good starting point for other tests. -Copy this to a new test name and modify tests/src/CMakefiles.txt to add the test to the build environment. - -Recent versions of the test infrastructure use a hierarchy of folders. Each folder contains src and CMakefiles.txt file. -See the CMakefiles.txt files for description of the intended purpose for each sub-directory. +The test infrastructure uses a hierarchy of folders. So add the new test to the appropriate folder. 
+The tests/src/runtimeApi/memory/hipMemset.cpp file contains a simple unit test and is a good starting point for other tests. +Copy this to a new test name and modify it. -#### Edit CMakefiles.txt: -// Example: +### HIP Integrated Tester (HIT) + +The HIT framework automatically finds and adds test cases to the CMAKE testing environment. It achieves this by parsing all files in the tests/src folder. +The parser looks for a code block similar to the one below. ``` -# Build the test executable: -build_hip_executable (hipMemset hipMemset.cpp) - - -# This runs the tests with the specified command-line testing. -# Multiple make_test may be specified. -make_test(hipMemset " ") +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * //Small copy + * RUN: %t -N 10 --memsetval 0x42 + * // Oddball size + * RUN: %t -N 10013 --memsetval 0x5a + * // Big copy + * RUN: %t -N 256M --memsetval 0xa6 + * HIT_END + */ ``` +In the above, BUILD commands provide instructions on how to build the test case while RUN commands provide instructions on how to execute the test case. -It is recommended to place the build and run steps adjacent in the CMakefiles.txt. +#### BUILD command + +The supported syntax for the BUILD command is: +``` +BUILD: %t %s HIPCC_OPTIONS HCC_OPTIONS NVCC_OPTIONS EXCLUDE_HIP_PLATFORM +``` +%s: refers to current source file name. Additional source files needed for the test can be specified by name (including relative path). +%t: refers to target executable name derived by removing the extension from the current source file. Alternatively a target executable name can be specified. +HIPCC_OPTIONS: All options specified after this delimiter are passed to hipcc on both HCC and NVCC platforms. +HCC_OPTIONS: All options specified after this delimiter are passed to hipcc on HCC platform only. +NVCC_OPTIONS: All options specified after this delimiter are passed to hipcc on NVCC platform only. 
+EXCLUDE_HIP_PLATFORM: This can be used to exclude a test case from HCC, NVCC or both platforms. + + +#### RUN command + +The supported syntax for the RUN command is: +``` +RUN: %t EXCLUDE_HIP_PLATFORM +``` +%t: refers to target executable name derived by removing the extension from the current source file. Alternatively a target executable name can be specified. +EXCLUDE_HIP_PLATFORM: This can be used to exclude a test case from HCC, NVCC or both platforms. Note that if the test has been excluded for a specific platform in the BUILD command, it is automatically excluded from the RUN command as well for the same platform. + + +#### RUN_NAMED command + +When using the RUN command, HIT will squash and append the arguments specified to the test executable name to generate the CMAKE test name. Sometimes we might want to specify a more descriptive name. The RUN_NAMED command is used for that. The supported syntax for the RUN_NAMED command is: +``` +RUN_NAMED: %t CMAKE_TEST_NAME EXCLUDE_HIP_PLATFORM +``` ### Running tests: @@ -43,11 +82,14 @@ ctest ### Run subsets of all tests: ``` -# Run one test on the commandline (obtain commandline parms from CMakefiles.tst) -./hipMemset +# Run one test on the commandline +./directed_tests/runtime/memory/hipMemset -# Run all the memory tests: +# Run all the hipMemcpy tests: ctest -R Memcpy + +# Run all tests in a specific folder: +ctest -R memory ``` @@ -55,7 +97,7 @@ ctest -R Memcpy Find the test and commandline that fail: -(From the test build directory, perhaps hip/tests/build) +(From the build directory, perhaps hip/build) grep -IR hipMemcpy-modes -IR ../tests/ ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 From ad33c9406ae7af93cffcf0a943a927b9795ace77 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 11:20:28 +0530 Subject: [PATCH 144/171] Updated RELEASE.md Change-Id: Ic451612555c66f3ed7131514fc97fcc41091370a --- RELEASE.md | 9 +++++++++ 1 file changed, 9 insertions(+) 
diff --git a/RELEASE.md b/RELEASE.md index 21fd8da7bb..5787c59881 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -13,6 +13,15 @@ Upcoming: ## Revision History: +=================================================================================================== +- new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent +- added support for building hipify-clang using clang 3.9 +- hipify-clang updates for CUDA 8.0 runtime+driver support +- renamed hipify to hipify-perl +- initial implementation of hipify-cmakefile +- several documentation updates & bug fixes + + =================================================================================================== Release: 1.0.17102 Date: 2017.03.07 From a9449533a1ca2f24bae816ab10880ba5c475f4bf Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 12 Jun 2017 11:53:25 +0530 Subject: [PATCH 145/171] Update P2P test for using memcpy and kernel tests Change-Id: Ib0f8fc9425e6e85fd11d7d02395c52bc713dcb37 --- .../runtimeApi/memory/p2p_copy_coherency.cpp | 90 +++++++++++++------ 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 6bc6235454..a5d79464d0 100644 --- a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -33,10 +33,14 @@ THE SOFTWARE. 
#ifdef __HIP_PLATFORM_HCC__ #include +#define USE_HCC_MEMTRACKER 0 #endif #define USE_HSA_COPY 1 +int elementSizes[] = {16, 1024,524288}; +int nSizes = sizeof(elementSizes) / sizeof(int); + int enablePeers(int dev0, int dev1) { int canAccessPeer01, canAccessPeer10; @@ -54,16 +58,25 @@ int enablePeers(int dev0, int dev1) return 0; }; - __global__ void -memsetIntKernel(int * ptr, int val, size_t numElements) +memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElements) { int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - if (gid < numElements) { - ptr[gid] = val; + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + ptr[i] = val; } }; +__global__ void +memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; void checkReverse(const int *ptr, int numElements, int expected) { for (int i=numElements-1; i>=0; i--) { @@ -76,52 +89,66 @@ void checkReverse(const int *ptr, int numElements, int expected) { printf ("test: OK\n"); } - -void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, - int * dataGpu0, int *dataGpu1, int *dataHost, int expected) +void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + int * dataGpu0_0, int * dataGpu0_1, int *dataGpu1, int *dataHost, int expected) { hipEvent_t e; - HIPCHECK(hipEventCreateWithFlags(&e,0)); - - printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel"); + if(!hostSync) { + HIPCHECK(hipEventCreateWithFlags(&e,0)); + } const size_t sizeElements = numElements * sizeof(int); + printf ("test: runTest with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? 
"enabled" : "disabled"); hipStream_t stepAStream = gpu0Stream; if (stepAIsCopy) { #ifdef USE_HSA_COPY - HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); + HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); #endif } else { - assert(0); // not yet supported. + //assert(0); // not yet supported. + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu0_0, dataGpu1, numElements); } - HIPCHECK(hipEventRecord(e, stepAStream)); - HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + if(!hostSync) { + HIPCHECK(hipEventRecord(e, stepAStream)); + HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + } else { + HIPCHECK(hipStreamSynchronize(stepAStream)); + } - HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream)); + HIPCHECK(hipMemcpyAsync(dataGpu0_1, dataGpu1, sizeElements, hipMemcpyDeviceToDevice, gpu1Stream)); - HIPCHECK(hipStreamSynchronize(gpu1Stream)); + if(!hostSync) { + HIPCHECK(hipEventRecord(e, gpu1Stream)); + } else { + HIPCHECK(hipStreamSynchronize(gpu1Stream)); + } + + HIPCHECK(hipMemcpyAsync(dataHost, dataGpu0_1, sizeElements, hipMemcpyDeviceToHost, gpu0Stream)); + HIPCHECK(hipStreamSynchronize(gpu0Stream)); checkReverse(dataHost, numElements, expected); } - -void testMultiGpu0(int dev0, int dev1, int numElements) +void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMemcpy) { const size_t sizeElements = numElements * sizeof(int); - int * dataGpu0, *dataGpu1, *dataHost; + int * dataGpu0_0, * dataGpu0_1, *dataGpu1, *dataHost; hipStream_t gpu0Stream, gpu1Stream; const int expected = 42; unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); HIPCHECK(hipSetDevice(dev0)); - HIPCHECK(hipMalloc(&dataGpu0, sizeElements)); + HIPCHECK(hipMalloc(&dataGpu0_0, 
sizeElements)); + HIPCHECK(hipMalloc(&dataGpu0_1, sizeElements)); HIPCHECK(hipStreamCreate(&gpu0Stream)); hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, - dataGpu0, expected, numElements); + dataGpu0_0, expected, numElements); HIPCHECK(hipDeviceSynchronize()); @@ -135,18 +162,19 @@ void testMultiGpu0(int dev0, int dev1, int numElements) HIPCHECK(hipHostMalloc(&dataHost, sizeElements)); memset(dataHost, 13, sizeElements); -#ifdef __HIP_PLATFORM_HCC__ +#if USE_HCC_MEMTRACKER hc::am_memtracker_print(0x0); #endif - + printf (" test: init complete\n"); + runTest(useMemcpy , hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); - runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected); - + HIPCHECK(hipFree(dataGpu0_0)); + HIPCHECK(hipFree(dataGpu0_1)); + HIPCHECK(hipFree(dataGpu1)); + HIPCHECK(hipHostFree(dataHost)); }; - - int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); @@ -168,8 +196,12 @@ int main(int argc, char *argv[]) return -1; }; - //testMultiGpu0(dev0, dev1, numElements); - + for(int index = 1;index < nSizes;index++) { + testMultiGpu(dev0, dev1, elementSizes[index] , false /* GPU Synchronization*/, true); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, true); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, false); + testMultiGpu(dev0, dev1, elementSizes[index] , false /*Host Synchronization*/, false); + } passed(); From a833b9a704b780dce49870ae3e3e756904927beb Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 13 Jun 2017 13:35:50 +0530 Subject: [PATCH 146/171] Input args NULL check in hipChooseDevice Change-Id: I1a7b8cded2f81d739645bbf3dab2f04bb9c3c796 --- src/hip_device.cpp | 110 ++++++++++++++++++++++++--------------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/src/hip_device.cpp 
b/src/hip_device.cpp index 93c1c20484..05db4c2b30 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -415,72 +415,78 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop ) int inPropCount = 0; int matchedPropCount = 0; hipError_t e = hipSuccess; - ihipGetDeviceCount( &deviceCount ); - *device = 0; - for (int i = 0; i < deviceCount; i++) { - ihipGetDeviceProperties( &tempProp, i ); - if(prop->major != 0) { - inPropCount++; - if(tempProp.major >= prop->major) { - matchedPropCount++; - } - if(prop->minor != 0) { + if((device == NULL) || (prop == NULL)) { + e = hipErrorInvalidValue; + } + if(e == hipSuccess) { + ihipGetDeviceCount( &deviceCount ); + *device = 0; + for (int i = 0; i < deviceCount; i++) { + ihipGetDeviceProperties( &tempProp, i ); + if(prop->major != 0) { inPropCount++; - if(tempProp.minor >= prop->minor) { - matchedPropCount++; - } + if(tempProp.major >= prop->major) { + matchedPropCount++; + } + if(prop->minor != 0) { + inPropCount++; + if(tempProp.minor >= prop->minor) { + matchedPropCount++; + } + } } - } - if(prop->totalGlobalMem != 0) { - inPropCount++; - if(tempProp.totalGlobalMem >= prop->totalGlobalMem) { - matchedPropCount++; + if(prop->totalGlobalMem != 0) { + inPropCount++; + if(tempProp.totalGlobalMem >= prop->totalGlobalMem) { + matchedPropCount++; + } } - } - if(prop->sharedMemPerBlock != 0) { - inPropCount++; - if(tempProp.sharedMemPerBlock >= prop->sharedMemPerBlock) { - matchedPropCount++; + if(prop->sharedMemPerBlock != 0) { + inPropCount++; + if(tempProp.sharedMemPerBlock >= prop->sharedMemPerBlock) { + matchedPropCount++; + } } - } - if(prop->maxThreadsPerBlock != 0) { - inPropCount++; - if(tempProp.maxThreadsPerBlock >= prop->maxThreadsPerBlock ) { - matchedPropCount++; + if(prop->maxThreadsPerBlock != 0) { + inPropCount++; + if(tempProp.maxThreadsPerBlock >= prop->maxThreadsPerBlock ) { + matchedPropCount++; + } } - } - if(prop->totalConstMem != 0) { - inPropCount++; - if(tempProp.totalConstMem >= 
prop->totalConstMem ) { - matchedPropCount++; + if(prop->totalConstMem != 0) { + inPropCount++; + if(tempProp.totalConstMem >= prop->totalConstMem ) { + matchedPropCount++; + } } - } - if(prop->multiProcessorCount != 0) { - inPropCount++; - if(tempProp.multiProcessorCount >= prop->multiProcessorCount ) { - matchedPropCount++; + if(prop->multiProcessorCount != 0) { + inPropCount++; + if(tempProp.multiProcessorCount >= prop->multiProcessorCount ) { + matchedPropCount++; + } } - } - if(prop->maxThreadsPerMultiProcessor != 0) { - inPropCount++; - if(tempProp.maxThreadsPerMultiProcessor >= prop->maxThreadsPerMultiProcessor ) { - matchedPropCount++; + if(prop->maxThreadsPerMultiProcessor != 0) { + inPropCount++; + if(tempProp.maxThreadsPerMultiProcessor >= prop->maxThreadsPerMultiProcessor ) { + matchedPropCount++; + } } - } - if(prop->memoryClockRate != 0) { - inPropCount++; - if(tempProp.memoryClockRate >= prop->memoryClockRate ) { - matchedPropCount++; + if(prop->memoryClockRate != 0) { + inPropCount++; + if(tempProp.memoryClockRate >= prop->memoryClockRate ) { + matchedPropCount++; + } + } + if(inPropCount == matchedPropCount) { + *device = i; } - } - if(inPropCount == matchedPropCount) { - *device = i; - } #if 0 else{ e= hipErrorInvalidValue; } #endif + } } return ihipLogStatus(e); } + From f805bd0c1d5f784ec3a4dc9cf3f5f492d19b293e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 13 Jun 2017 20:25:11 +0300 Subject: [PATCH 147/171] [HIPIFY] Initial sync HIPIFY with HIP by CUDA Driver API functions. + CUDA_Driver_API_functions_supported_by_HIP.md update. + Initial update of HIPIFY with CUDA driver API functions: 1.Error Handling, 2.Initialization, 3.Version Management, 5-6.Device Management, 7.Primary Context, 8-9.Context, 10.Module Management, 11.Memory Management. + Sync HIP functions against CUDA Driver and Runtime API functions. + Typo fixes. ToDo: 12-30 modules of CUDA Driver API. 
--- ...A_Driver_API_functions_supported_by_HIP.md | 123 +++++++++++-- hipify-clang/src/Cuda2Hip.cpp | 169 +++++++++++------- 2 files changed, 221 insertions(+), 71 deletions(-) diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index ad9d791a6d..d4b54438bb 100644 --- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -386,65 +386,168 @@ | define |`CUDA_ARRAY3D_TEXTURE_GATHER` | | This flag must be set in order to perform texture gather operations on a CUDA array. | | define |`CUDA_VERSION` | | CUDA API version number. | - ## **2. Error Handling** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuGetErrorName` | | Gets the string representation of an error code enum name. | +| `cuGetErrorString` | | Gets the string description of an error code. | ## **3. Initialization** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuInit` | `hipInit` | Initialize the CUDA driver API. | ## **4. Version Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDriverGetVersion` | `hipDriverGetVersion` | Returns the CUDA driver version. | ## **5. 
Device Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| `cuDeviceGet` | `hipGetDevice` | Returns a handle to a compute device. | +| `cuDeviceGetAttribute` | `hipDeviceGetAttribute` | Returns information about the device. | +| `cuDeviceGetCount` | `hipGetDeviceCount` | Returns the number of compute-capable devices. | +| `cuDeviceGetName` | `hipDeviceGetName` | Returns an identifier string for the device. | +| `cuDeviceTotalMem` | `hipDeviceTotalMem` | Returns the total amount of memory on the device. | ## **6. Device Management [DEPRECATED]** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDeviceComputeCapability` | `hipDeviceComputeCapability` | Returns the compute capability of the device. | +| `cuDeviceGetProperties` | `hipGetDeviceProperties` | Returns properties for a selected device. | ## **7. Primary Context Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDevicePrimaryCtxGetState` | `hipDevicePrimaryCtxGetState` | Get the state of the primary context. | +| `cuDevicePrimaryCtxRelease` | `hipDevicePrimaryCtxRelease` | Release the primary context on the GPU. | +| `cuDevicePrimaryCtxReset` | `hipDevicePrimaryCtxReset` | Destroy all allocations and reset all state on the primary context. 
| +| `cuDevicePrimaryCtxRetain` | `hipDevicePrimaryCtxRetain` | Retain the primary context on the GPU. | +| `cuDevicePrimaryCtxSetFlags` | `hipDevicePrimaryCtxSetFlags` | Set flags for the primary context. | ## **8. Context Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuCtxCreate` | `hipCtxCreate` | Create a CUDA context. | +| `cuCtxDestroy` | `hipCtxDestroy` | Destroy a CUDA context. | +| `cuCtxGetApiVersion` | `hipCtxGetApiVersion` | Gets the context's API version. | +| `cuCtxGetCacheConfig` | `hipCtxGetCacheConfig` | Returns the preferred cache configuration for the current context. | +| `cuCtxGetCurrent` | `hipCtxGetCurrent` | Returns the CUDA context bound to the calling CPU thread. | +| `cuCtxGetDevice` | `hipCtxGetDevice` | Returns the device ID for the current context. | +| `cuCtxGetFlags` | `hipCtxGetFlags` | Returns the flags for the current context. | +| `cuCtxGetLimit` | | Returns resource limits. | +| `cuCtxGetSharedMemConfig` | `hipCtxGetSharedMemConfig` | Returns the current shared memory configuration for the current context. | +| `cuCtxGetStreamPriorityRange` | | Returns numerical values that correspond to the least and greatest stream priorities. | +| `cuCtxPopCurrent` | `hipCtxPopCurrent` | Pops the current CUDA context from the current CPU thread. | +| `cuCtxPushCurrent` | `hipCtxPushCurrent` | Pushes a context on the current CPU thread. | +| `cuCtxSetCacheConfig` | `hipCtxSetCacheConfig` | Sets the preferred cache configuration for the current context. | +| `cuCtxSetCurrent` | `hipCtxSetCurrent` | Binds the specified CUDA context to the calling CPU thread. | +| `cuCtxSetLimit` | | Set resource limits. 
| +| `cuCtxSetSharedMemConfig` | `hipCtxSetSharedMemConfig` | Sets the shared memory configuration for the current context. | +| `cuCtxSynchronize` | `hipCtxSynchronize` | Block for a context's tasks to complete. | ## **9. Context Management [DEPRECATED]** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuCtxAttach` | | Increment a context's usage-count. | +| `cuCtxDetach` | | Decrement a context's usage-count. | ## **10. Module Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuLinkAddData` | | Add an input to a pending linker invocation. | +| `cuLinkAddFile` | | Add a file input to a pending linker invocation. | +| `cuLinkComplete` | | Complete a pending linker invocation. | +| `cuLinkCreate` | | Creates a pending JIT linker invocation. | +| `cuLinkDestroy` | | Destroys state for a JIT linker invocation. | +| `cuModuleGetFunction` | `hipModuleGetFunction` | Returns a function handle. | +| `cuModuleGetGlobal` | `hipModuleGetGlobal` | Returns a global pointer from a module. | +| `cuModuleGetSurfRef` | | Returns a handle to a surface reference. | +| `cuModuleGetTexRef` | | Returns a handle to a texture reference. | +| `cuModuleLoad` | `hipModuleLoad` | Loads a compute module. | +| `cuModuleLoadData` | `hipModuleLoadData` | Load a module's data. | +| `cuModuleLoadDataEx` | `hipModuleLoadDataEx` | Load a module's data with options. | +| `cuModuleLoadFatBinary` | | Load a module's data. | +| `cuModuleUnload` | `hipModuleUnload` | Unloads a module. | ## **11. 
Memory Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuArray3DCreate` | | Creates a 3D CUDA array. | +| `cuArray3DGetDescriptor` | | Get a 3D CUDA array descriptor. | +| `cuArrayCreate` | | Creates a 1D or 2D CUDA array. | +| `cuArrayDestroy` | | Destroys a CUDA array. | +| `cuArrayGetDescriptor` | | Get a 1D or 2D CUDA array descriptor. | +| `cuDeviceGetByPCIBusId` | `hipDeviceGetByPCIBusId` | Returns a handle to a compute device. | +| `cuDeviceGetPCIBusId` | `hipDeviceGetPCIBusId` | Returns a PCI Bus Id string for the device. | +| `cuIpcCloseMemHandle` | | Close memory mapped with cuIpcOpenMemHandle. | +| `cuIpcGetEventHandle` | | Gets an interprocess handle for a previously allocated event. | +| `cuIpcGetMemHandle` | | Gets an interprocess memory handle for an existing device memory allocation. | +| `cuIpcOpenEventHandle` | | Opens an interprocess event handle for use in the current process. | +| `cuIpcOpenMemHandle` | | Opens an interprocess memory handle exported from another process and returns a device pointer usable in the local process. | +| `cuMemAlloc` | `hipMalloc` | Allocates device memory. | +| `cuMemAllocHost` | | Allocates page-locked host memory. | +| `cuMemAllocManaged` | | Allocates memory that will be automatically managed by the Unified Memory system. | +| `cuMemAllocPitch` | | Allocates pitched device memory. | +| `cuMemcpy` | | Copies memory. | +| `cuMemcpy2D` | | Copies memory for 2D arrays. | +| `cuMemcpy2DAsync` | | Copies memory for 2D arrays. | +| `cuMemcpy2DUnaligned` | | Copies memory for 2D arrays. | +| `cuMemcpy3D` | | Copies memory for 3D arrays. | +| `cuMemcpy3DAsync` | | Copies memory for 3D arrays. | +| `cuMemcpy3DPeer` | | Copies memory between contexts. 
| +| `cuMemcpy3DPeerAsync` | | Copies memory between contexts asynchronously. | +| `cuMemcpyAsync` | | Copies memory asynchronously. | +| `cuMemcpyAtoA` | | Copies memory from Array to Array. | +| `cuMemcpyAtoD` | | Copies memory from Array to Device. | +| `cuMemcpyAtoH` | | Copies memory from Array to Host. | +| `cuMemcpyAtoHAsync` | | Copies memory from Array to Host. | +| `cuMemcpyDtoA` | | Copies memory from Device to Array. | +| `cuMemcpyDtoD` | `hipMemcpyDtoD` | Copies memory from Device to Device. | +| `cuMemcpyDtoDAsync` | `hipMemcpyDtoDAsync` | Copies memory from Device to Device. | +| `cuMemcpyDtoH` | `hipMemcpyDtoH` | Copies memory from Device to Host. | +| `cuMemcpyDtoHAsync` | `hipMemcpyDtoHAsync` | Copies memory from Device to Host. | +| `cuMemcpyHtoA` | | Copies memory from Host to Array. | +| `cuMemcpyHtoAAsync` | | Copies memory from Host to Array. | +| `cuMemcpyHtoD` | `hipMemcpyHtoD` | Copies memory from Host to Device. | +| `cuMemcpyHtoDAsync` | `hipMemcpyHtoDAsync` | Copies memory from Host to Device. | +| `cuMemcpyPeer` | | Copies device memory between two contexts. | +| `cuMemcpyPeerAsync` | | Copies device memory between two contexts asynchronously. | +| `cuMemFree` | `hipFree` | Frees device memory. | +| `cuMemFreeHost` | `hipFreeHost` | Frees page-locked host memory. | +| `cuMemGetAddressRange` | | Get information on memory allocations. | +| `cuMemGetInfo` | `hipMemGetInfo` | Gets free and total memory. | +| `cuMemHostAlloc` | `hipHostMalloc` | Allocates page-locked host memory. | +| `cuMemHostGetDevicePointer` | | Passes back device pointer of mapped pinned memory. | +| `cuMemHostGetFlags` | | Passes back flags that were used for a pinned allocation. | +| `cuMemHostRegister` | `hipHostRegister` | Registers an existing host memory range for use by CUDA. | +| `cuMemHostUnregister` | `hipHostUnregister` | Unregisters a memory range that was registered with cuMemHostRegister. | +| `cuMemsetD16` | | Initializes device memory. 
| +| `cuMemsetD16Async` | | Sets device memory. | +| `cuMemsetD2D16` | | Initializes device memory. | +| `cuMemsetD2D16Async` | | Sets device memory. | +| `cuMemsetD2D32` | | Initializes device memory. | +| `cuMemsetD2D32Async` | | Sets device memory. | +| `cuMemsetD2D8` | | Initializes device memory. | +| `cuMemsetD2D8Async` | | Sets device memory. | +| `cuMemsetD32` | `hipMemset` | Initializes device memory. | +| `cuMemsetD32Async` | `hipMemsetAsync` | Sets device memory. | +| `cuMemsetD8` | | Initializes device memory. | +| `cuMemsetD8Async` | | Sets device memory. | +| `cuMipmappedArrayCreate` | | Creates a CUDA mipmapped array. | +| `cuMipmappedArrayDestroy` | | Destroys a CUDA mipmapped array. | +| `cuMipmappedArrayGetLevel` | | Gets a mipmap level of a CUDA mipmapped array. | ## **12. Unified Addressing** diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 0825285b51..de4da78451 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -391,7 +391,7 @@ struct cuda2hipMap { cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 ///////////////////////////// CUDA DRIVER API ///////////////////////////// - // enums + // structs cuda2hipRename["CUDA_ARRAY3D_DESCRIPTOR"] = {"HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUDA_ARRAY_DESCRIPTOR"] = {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUDA_MEMCPY2D"] = {"HIP_MEMCPY2D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -453,16 +453,16 @@ struct cuda2hipMap { cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4) // Context flags - cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, 
HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) @@ -882,52 +882,79 @@ struct cuda2hipMap { cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"] = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 2 cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"] = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Error Handling + cuda2hipRename["cuGetErrorName"] = {"hipGetErrorName___", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // cudaGetErrorName (hipGetErrorName) has different signature + cuda2hipRename["cuGetErrorString"] = {"hipGetErrorString___", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // cudaGetErrorString (hipGetErrorString) has different signature + // Init cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; // Driver - cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; + cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; - // Context + // Context Management cuda2hipRename["cuCtxCreate_v2"] = {"hipCtxCreate", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxDestroy_v2"] = {"hipCtxDestroy", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxPopCurrent_v2"] = {"hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxPushCurrent_v2"] = {"hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetCurrent"] = {"hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetCurrent"] = {"hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetDevice"] = {"hipCtxGetDevice", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetApiVersion"] = {"hipCtxGetApiVersion", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetCacheConfig"] = 
{"hipCtxGetCacheConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetCacheConfig"] = {"hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetSharedMemConfig"] = {"hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetSharedMemConfig"] = {"hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSynchronize"] = {"hipCtxSynchronize", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetCurrent"] = {"hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetDevice"] = {"hipCtxGetDevice", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetFlags"] = {"hipCtxGetFlags", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxGetSharedMemConfig"] = {"hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetStreamPriorityRange"] = {"hipCtxGetStreamPriorityRange", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxPopCurrent_v2"] = {"hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxPushCurrent_v2"] = {"hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetCacheConfig"] = {"hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetCurrent"] = {"hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxSetSharedMemConfig"] = {"hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSynchronize"] = {"hipCtxSynchronize", CONV_CONTEXT, API_DRIVER}; + // Context Management [DEPRECATED] + cuda2hipRename["cuCtxAttach"] = {"hipCtxAttach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxDetach"] = {"hipCtxDetach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + + // Peer Context Memory Access cuda2hipRename["cuCtxEnablePeerAccess"] = {"hipCtxEnablePeerAccess", 
CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxDisablePeerAccess"] = {"hipCtxDisablePeerAccess", CONV_CONTEXT, API_DRIVER}; - // unsupported yet by HIP - cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + // Primary Context Management cuda2hipRename["cuDevicePrimaryCtxGetState"] = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxRelease"] = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxReset"] = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxSetFlags"] = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER}; - // Device + // Device Management cuda2hipRename["cuDeviceGet"] = {"hipGetDevice", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetName"] = {"hipDeviceGetName", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetCount"] = {"hipGetDeviceCount", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_DRIVER}; - cuda2hipRename["cuDeviceGetProperties"] = {"hipGetDeviceProperties", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetPCIBusId"] = {"hipDeviceGetPCIBusId", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetByPCIBusId"] = {"hipDeviceGetByPCIBusId", CONV_DEV, API_DRIVER}; - cuda2hipRename["cuDeviceTotalMem_v2"] = {"hipDeviceTotalMem", CONV_DEV, API_DRIVER}; + + // Device Management [DEPRECATED] cuda2hipRename["cuDeviceComputeCapability"] = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER}; - 
cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + cuda2hipRename["cuDeviceGetProperties"] = {"hipGetDeviceProperties", CONV_DEV, API_DRIVER}; + + // Module Management + cuda2hipRename["cuLinkAddData"] = {"hipLinkAddData", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkAddFile"] = {"hipLinkAddFile", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkComplete"] = {"hipLinkComplete", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkCreate"] = {"hipLinkCreate", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkDestroy"] = {"hipLinkDestroy", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleGetFunction"] = {"hipModuleGetFunction", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleGetGlobal_v2"] = {"hipModuleGetGlobal", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleGetSurfRef"] = {"hipModuleGetSurfRef", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleGetTexRef"] = {"hipModuleGetTexRef", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes @@ -960,16 +987,6 @@ struct cuda2hipMap { cuda2hipRename["cuEventRecord"] = {"hipEventRecord", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT, API_DRIVER}; - // Module - cuda2hipRename["cuModuleGetFunction"] = {"hipModuleGetFunction", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleGetGlobal_v2"] = {"hipModuleGetGlobal", CONV_MODULE, 
API_DRIVER}; - cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; - // unsupported yet by HIP - cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams @@ -986,39 +1003,69 @@ struct cuda2hipMap { cuda2hipRename["cuStreamWaitEvent"] = {"hipStreamWaitEvent", CONV_STREAM, API_DRIVER}; // Memory management + cuda2hipRename["cuArray3DCreate"] = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArray3DGetDescriptor"] = {"hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayCreate"] = {"hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayDestroy"] = {"hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayGetDescriptor"] = {"hipArrayGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcCloseMemHandle"] = {"hipIpcCloseMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcGetEventHandle"] = {"hipIpcGetEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcGetMemHandle"] = {"hipIpcGetMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcOpenEventHandle"] = {"hipIpcOpenEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcOpenMemHandle"] = {"hipIpcOpenMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemAlloc_v2"] = {"hipMalloc", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; - - cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, 
API_DRIVER}; - cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; - + cuda2hipRename["cuMemAllocHost"] = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemAllocManaged"] = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures + cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures + cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2D due to different signatures + cuda2hipRename["cuMemcpy2DAsync"] = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures + cuda2hipRename["cuMemcpy2DUnaligned"] = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures + cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures + cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures + cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures + cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures + cuda2hipRename["cuMemcpyAtoA"] = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoD"] = {"hipMemcpyAtoD", CONV_MEM, API_DRIVER, 
HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoH"] = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoHAsync"] = {"hipMemcpyAtoHAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyDtoA"] = {"hipMemcpyDtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyDtoD_v2"] = {"hipMemcpyDtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoDAsync_v2"] = {"hipMemcpyDtoDAsync", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoH_v2"] = {"hipMemcpyDtoH", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoHAsync_v2"] = {"hipMemcpyDtoHAsync", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemcpyHtoA"] = {"hipMemcpyHtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyHtoAAsync"] = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyHtoD_v2"] = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyHtoDAsync_v2"] = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER}; - - // unsupported yet by HIP - cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures + cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures + cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemGetAddressRange"] = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, 
HIP_UNSUPPORTED}; + cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostGetDevicePointer"] = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemHostGetFlags"] = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister) cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; - // unsupported yet by HIP cuda2hipRename["cuMemsetD2D32_v2"] = {"hipMemsetD2D32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D32Async"] = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; - + cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", 
CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) + cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) + cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayCreate"] = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayDestroy"] = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayGetLevel"] = {"hipMipmappedArrayGetLevel", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) @@ -1298,7 +1345,7 @@ struct cuda2hipMap { // Attributes cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) + cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; // 1 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1) cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, 
API_RUNTIME}; // 2 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2) cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; // 3 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3) From c17ab848821426708b3fccba75f0a1f5cefaea8d Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Mon, 12 Jun 2017 17:14:12 +0530 Subject: [PATCH 148/171] Add peer2peer bandwidth and latency test Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e --- .../hipBusBandwidth/hipBusBandwidth.cpp | 395 +++++++++++++++++- 1 file changed, 372 insertions(+), 23 deletions(-) diff --git a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 09f78543c9..b3b0b3e4a6 100644 --- a/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -16,13 +16,15 @@ int p_iterations = 10; int p_beatsperiteration=1; int p_device = 0; int p_detailed = 0; -bool p_async = 0; +bool p_async = 0; int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try. -int p_onesize = 0; +int p_onesize = 0; bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; +bool p_p2p = false; + //#define NO_CHECK @@ -70,7 +72,7 @@ std::string sizeToString(int size) // **************************************************************************** -hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind) +hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind ) { if (p_async) { return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL); @@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) } + + + #define failed(...) 
\ printf ("error: ");\ printf (__VA_ARGS__);\ @@ -646,6 +651,326 @@ int parseInt(const char *str, int *output) } +void checkPeer2PeerSupport() +{ + int deviceCnt; + hipGetDeviceCount(&deviceCnt); + std::cout << "Total no. of available gpu #" << deviceCnt << "\n" << std::endl; + + for(int deviceId=0; deviceIdhost then host-->GPU2)\n\n" << std::endl; +} + +void enablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceEnablePeerAccess(peerGpu, 0); + } +} + +void disablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceDisablePeerAccess(peerGpu); + } +} + +std::string gpuIDToString(int gpuID) +{ + using namespace std; + stringstream ss; + ss << gpuID; + return ss.str(); +} + +void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB) +{ + int gpuCount; + hipGetDeviceCount(&gpuCount); + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + // Cleanup + hipFree((void*)currentGpuMem); + hipFree((void*)peerGpuMem); 
+ CHECK_HIP_ERROR(); + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + + } + +} + +void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) { + + int gpuCount; + hipGetDeviceCount(&gpuCount); + + hipStream_t stream[2]; + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + for (int i=0; i<2; i++) { + hipStreamDestroy(stream[i]); + + hipFree((void*)currentGpuMem[i]); + hipFree((void*)peerGpuMem[i]); + CHECK_HIP_ERROR(); + } + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + } +} + + void printConfig() { hipDeviceProp_t props; hipGetDeviceProperties(&props, p_device); @@ -662,9 +987,9 @@ void help() { printf (" --d2h : Run only device-to-host test.\n"); printf (" --h2d : Run only host-to-device test.\n"); printf (" --bidir : Run only bidir copy test.\n"); + printf (" --p2p : Run only peer2peer unidir and bidir copy tests.\n"); printf (" --verbose : Print verbose status messages as test is run.\n"); printf (" --detailed : Print detailed report (including all trials).\n"); - printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. 
Default uses hipMemcpy.\n"); printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n"); @@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[]) p_d2h = false; p_bidir = true; + } else if (!strcmp(arg, "--p2p")) { + p_h2d = false; + p_d2h = false; + p_bidir = false; + p_p2p = true; + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { help(); exit(EXIT_SUCCESS); @@ -737,39 +1068,57 @@ int main(int argc, char *argv[]) { parseStandardArguments(argc, argv); - printConfig(); + if (p_p2p) { + checkPeer2PeerSupport(); - if (p_h2d) { - ResultDatabase resultDB; - RunBenchmark_H2D(resultDB); + ResultDatabase resultDB_Unidir, resultDB_Bidir; - resultDB.DumpSummary(std::cout); + RunBenchmark_P2P_Unidir(resultDB_Unidir); + RunBenchmark_P2P_Bidir(resultDB_Bidir); + + resultDB_Unidir.DumpSummary(std::cout); + resultDB_Bidir.DumpSummary(std::cout); if (p_detailed) { - resultDB.DumpDetailed(std::cout); + resultDB_Unidir.DumpDetailed(std::cout); + resultDB_Bidir.DumpDetailed(std::cout); } } + else { + printConfig(); - if (p_d2h) { - ResultDatabase resultDB; - RunBenchmark_D2H(resultDB); + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } - } - if (p_bidir) { - ResultDatabase resultDB; - RunBenchmark_Bidir(resultDB); + if (p_bidir) { + ResultDatabase resultDB; + RunBenchmark_Bidir(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } } } From 901538da778fb3660fbec81485ced391d33559c1 Mon Sep 
17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 14 Jun 2017 09:45:46 +0530 Subject: [PATCH 149/171] Bump HIP base version to 1.2 Change-Id: I8ecc164afed4383f78579ed86a5c8c11a73b0780 --- bin/hipconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/hipconfig b/bin/hipconfig index 663a1e14cd..39fdab5a99 100755 --- a/bin/hipconfig +++ b/bin/hipconfig @@ -1,7 +1,7 @@ #!/usr/bin/perl -w $HIP_BASE_VERSION_MAJOR = "1"; -$HIP_BASE_VERSION_MINOR = "0"; +$HIP_BASE_VERSION_MINOR = "2"; # Need perl > 5.10 to use logic-defined or use 5.006; use v5.10.1; From ba51d7f676ab830253a6ea1affe03c214c1ccc72 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 14 Jun 2017 15:18:57 +0530 Subject: [PATCH 150/171] Validity check of input arguments in Ipc Mem APIs Change-Id: Ia48e949d19f354f10c7e44cc2457fd4154bf6d76 --- src/hip_memory.cpp | 88 +++++++++++++++++++++++++--------------------- 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index c04c2611c3..ce65579e34 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1275,70 +1275,78 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ // Get the size of allocated pointer size_t psize; hc::accelerator acc; - hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 ); - am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr ); - if (status == AM_SUCCESS) { - psize = (size_t)amPointerInfo._sizeBytes; - } - else + if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle; - // Save the size of the pointer to hipIpcMemHandle - iHandle->psize = psize; + } else { + hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 ); + am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr ); + if (status == AM_SUCCESS) { + psize = (size_t)amPointerInfo._sizeBytes; + } else + hipStatus = hipErrorInvalidResourceHandle; + 
ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle; + // Save the size of the pointer to hipIpcMemHandle + iHandle->psize = psize; #if USE_IPC - // Create HSA ipc memory - hsa_status_t hsa_status = - hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle)); - if(hsa_status!= HSA_STATUS_SUCCESS) - hipStatus = hipErrorMemoryAllocation; + // Create HSA ipc memory + hsa_status_t hsa_status = + hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle)); + if(hsa_status!= HSA_STATUS_SUCCESS) + hipStatus = hipErrorMemoryAllocation; #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif - + } return ihipLogStatus(hipStatus); } hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){ HIP_INIT_API ( devPtr, &handle , flags); hipError_t hipStatus = hipSuccess; - + if(devPtr == NULL) { + hipStatus = hipErrorInvalidValue; + } else { #if USE_IPC - // Get the current device agent. - hc::accelerator acc; - hsa_agent_t *agent = static_cast(acc.get_hsa_agent()); - if(!agent) - return hipErrorInvalidResourceHandle; + // Get the current device agent. 
+ hc::accelerator acc; + hsa_agent_t *agent = static_cast(acc.get_hsa_agent()); + if(!agent) + return hipErrorInvalidResourceHandle; - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; - //Attach ipc memory - auto ctx= ihipGetTlsDefaultCtx(); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - hsa_status_t hsa_status = - hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - hipStatus = hipErrorMapBufferObjectFailed; - } + ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; + //Attach ipc memory + auto ctx= ihipGetTlsDefaultCtx(); + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + hsa_status_t hsa_status = + hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + hipStatus = hipErrorMapBufferObjectFailed; + } #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif + } return ihipLogStatus(hipStatus); } hipError_t hipIpcCloseMemHandle(void *devPtr){ HIP_INIT_API ( devPtr ); hipError_t hipStatus = hipSuccess; - + if(devPtr == NULL) { + hipStatus = hipErrorInvalidValue; + } else { #if USE_IPC - hsa_status_t hsa_status = - hsa_amd_ipc_memory_detach(devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - return hipErrorInvalidResourceHandle; + hsa_status_t hsa_status = + hsa_amd_ipc_memory_detach(devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + return hipErrorInvalidResourceHandle; #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif + } return ihipLogStatus(hipStatus); } From fd36303c24fcbfd21ac9bfba56162a51317f783e Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 14 Jun 2017 11:10:52 -0500 Subject: 
[PATCH 151/171] Additional GGL make_kernel_functor_* macros, contributed by Alex Change-Id: I01aabb7d2b5418fcefb1bbf78eb5d1888dbc5c96 --- include/hip/hcc_detail/grid_launch_GGL.hpp | 122 +++++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/include/hip/hcc_detail/grid_launch_GGL.hpp b/include/hip/hcc_detail/grid_launch_GGL.hpp index 8e3dab8482..eac48b595e 100644 --- a/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -245,6 +245,128 @@ namespace hip_impl HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ HIP_kernel_functor_name_end ## _ ## n + #define make_kernel_functor_hip_30(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26, p27)\ + struct make_kernel_name_hip(function_name, 28) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + std::decay_t _p27_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_, _p27_);\ + }\ + } + #define make_kernel_functor_hip_29(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26)\ + struct 
make_kernel_name_hip(function_name, 27) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_);\ + }\ + } + #define make_kernel_functor_hip_28(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25)\ + struct make_kernel_name_hip(function_name, 26) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, 
_p21_, _p22_, _p23_, _p24_, _p25_);\ + }\ + } #define make_kernel_functor_hip_27(\ function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ From 0208fa4e70899cbd2af550e65425ccbb9c7c8414 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 14 Jun 2017 19:55:55 +0300 Subject: [PATCH 152/171] [HIPIFY] Sync HIPIFY with HIP by CUDA Driver API functions. + 4.12. Unified Addressing + 4.13. Stream Management ToDo: 4.14 - 4.31 modules of CUDA Driver API. --- ...A_Driver_API_functions_supported_by_HIP.md | 22 +++++++- hipify-clang/src/Cuda2Hip.cpp | 54 ++++++++++--------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index d4b54438bb..0b3bb540bf 100644 --- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -553,13 +553,31 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuMemAdvise` | | Advise about the usage of a given memory range. | +| `cuMemPrefetchAsync` | | Prefetches memory to the specified destination device. | +| `cuMemRangeGetAttribute` | | Query an attribute of a given memory range. | +| `cuMemRangeGetAttributes` | | Query attributes of a given memory range. | +| `cuPointerGetAttribute` | | Returns information about a pointer. | +| `cuPointerGetAttributes` | | Returns information about a pointer. | +| `cuPointerSetAttribute` | | Set attributes on a previously allocated memory region. | ## **13. 
Stream Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuStreamAddCallback` | | Add a callback to a compute stream. | +| `cuStreamAttachMemAsync` | | Attach memory to a stream asynchronously. | +| `cuStreamCreate` | | Create a stream. | +| `cuStreamCreateWithPriority` | | Create a stream with the given priority. | +| `cuStreamDestroy` | `hipStreamDestroy` | Destroys a stream. | +| `cuStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a given stream. | +| `cuStreamGetPriority` | `hipStreamGetPriority` | Query the priority of a given stream. | +| `cuStreamQuery` | `hipStreamQuery` | Determine status of a compute stream. | +| `cuStreamSynchronize` | `hipStreamSynchronize` | Wait until a stream's tasks are completed. | +| `cuStreamWaitEvent` | `hipStreamWaitEvent` | Make a compute stream wait on an event. | +| `cuStreamBatchMemOp` | | Batch operations to synchronize the stream via memory operations. | +| `cuStreamWaitValue32` | | Wait on a memory location. | +| `cuStreamWriteValue32` | | Write a value to memory. | ## **14. 
Event Management** diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index de4da78451..7f9fefa7f9 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -990,17 +990,19 @@ struct cuda2hipMap { cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams - // unsupported yet by HIP cuda2hipRename["cuStreamAddCallback"] = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - - cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamAttachMemAsync"] = {"hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaStreamCreate due to different signatures + cuda2hipRename["cuStreamCreateWithPriority"] = {"hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuStreamDestroy_v2"] = {"hipStreamDestroy", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamGetFlags"] = {"hipStreamGetFlags", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamGetPriority"] = {"hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuStreamQuery"] = {"hipStreamQuery", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamSynchronize"] = {"hipStreamSynchronize", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamWaitEvent"] = {"hipStreamWaitEvent", CONV_STREAM, API_DRIVER}; + 
cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE // Memory management cuda2hipRename["cuArray3DCreate"] = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1016,16 +1018,16 @@ struct cuda2hipMap { cuda2hipRename["cuMemAlloc_v2"] = {"hipMalloc", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemAllocHost"] = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemAllocManaged"] = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures - cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures - cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2D due to different signatures - cuda2hipRename["cuMemcpy2DAsync"] = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures + cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures + cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures + cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2D due to different signatures + cuda2hipRename["cuMemcpy2DAsync"] = 
{"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures cuda2hipRename["cuMemcpy2DUnaligned"] = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures - cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures - cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures - cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures - cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures + cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures + cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures + cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures + cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures + cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures cuda2hipRename["cuMemcpyAtoA"] = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyAtoD"] = {"hipMemcpyAtoD", CONV_MEM, 
API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyAtoH"] = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1039,17 +1041,17 @@ struct cuda2hipMap { cuda2hipRename["cuMemcpyHtoAAsync"] = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyHtoD_v2"] = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyHtoDAsync_v2"] = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures - cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures + cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures + cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemGetAddressRange"] = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) cuda2hipRename["cuMemHostGetDevicePointer"] = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemHostGetFlags"] = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) - 
cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister) + cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister) cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1058,18 +1060,22 @@ struct cuda2hipMap { cuda2hipRename["cuMemsetD2D32Async"] = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) - cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) + cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) + cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMipmappedArrayCreate"] = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMipmappedArrayDestroy"] = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMipmappedArrayGetLevel"] = 
{"hipMipmappedArrayGetLevel", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) + + // Unified Addressing + cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) cuda2hipRename["cuMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes) + cuda2hipRename["cuPointerGetAttribute"] = {"hipPointerGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cuPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cuPointerSetAttribute"] = {"hipPointerSetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Texture Reference Mngmnt // Texture reference filtering modes From d24818bff6e2f202b031e92d0b0f2c27a31757e5 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 15 Jun 2017 00:21:47 +0530 Subject: [PATCH 153/171] Arguments validation in hipDeviceGetPCIBusId Change-Id: I89770517c3ac94e4bf476344d27c18f03cfcde08 --- src/hip_device.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/hip_device.cpp b/src/hip_device.cpp index 05db4c2b30..2bb9970d35 100644 --- a/src/hip_device.cpp +++ b/src/hip_device.cpp @@ -369,12 +369,24 @@ hipError_t hipDeviceGetName(char 
*name,int len,hipDevice_t device) hipError_t hipDeviceGetPCIBusId (char *pciBusId,int len, int device) { HIP_INIT_API(pciBusId, len, device); - hipError_t e = hipSuccess; - int tempPciBusId = 0; - e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device); - if( e == hipSuccess) { - std::string tempPciStr = std::to_string(tempPciBusId); - memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() ); + hipError_t e = hipErrorInvalidValue; + int deviceCount = 0; + ihipGetDeviceCount( &deviceCount ); + if((device > deviceCount) || (device < 0)) { + e = hipErrorInvalidDevice; + } else { + if((pciBusId != nullptr) && (len > 0)) { + int tempPciBusId = 0; + e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device); + if( e == hipSuccess) { + std::string tempPciStr = std::to_string(tempPciBusId); + if( len < tempPciStr.length()){ + e = hipErrorInvalidValue; + } else { + memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() ); + } + } + } } return ihipLogStatus(e); } From 64bb8d154a9ca093a93b8c945a469d1ac1686c66 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 16 Jun 2017 09:02:26 -0500 Subject: [PATCH 154/171] removed bad copy constructor Change-Id: I661991d9d43941a61848b0b8e9879c0bfa811b40 --- include/hip/hcc_detail/hip_vector_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index 9da34d9f32..93c82cc0cb 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -36,25 +36,21 @@ THE SOFTWARE. 
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x) { } \ __device__ __host__ type(const type& val) : x(val.x) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ __device__ __host__ ~type() {} From d1e28df22601a0db2e345e11c68cdc92a38da063 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 16 Jun 2017 09:07:06 -0500 Subject: [PATCH 155/171] fixed float2int functions Change-Id: I67be79149f06daacf0f0d131bdedabf294126248 --- src/device_functions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/device_functions.cpp b/src/device_functions.cpp index 10d8d3ab89..615ae4d0b7 100644 --- a/src/device_functions.cpp +++ b/src/device_functions.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -149,19 +149,19 @@ __device__ long long int __double_as_longlong(double x) return hold64.sli; } -__device__ int float2int_rd(float x) +__device__ int __float2int_rd(float x) { return (int)x; } -__device__ int float2int_rn(float x) +__device__ int __float2int_rn(float x) { return (int)x; } -__device__ int float2int_ru(float x) +__device__ int __float2int_ru(float x) { return (int)x; } -__device__ int float2int_rz(float x) +__device__ int __float2int_rz(float x) { return (int)x; } From 3c73229916787bfb7e0ab9549d2fb877b20729bd Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Sun, 18 Jun 2017 12:31:31 +0530 Subject: [PATCH 156/171] Abort device function in HIP/HCC, need new HCC Change-Id: I4195ab75e9b7b48c8b8128d6925ddc0fa5e9e009 --- include/hip/hcc_detail/hip_runtime.h | 3 +++ src/device_util.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h index 95826f9b60..da3b7ba50e 100644 --- a/include/hip/hcc_detail/hip_runtime.h +++ b/include/hip/hcc_detail/hip_runtime.h @@ -174,6 +174,9 @@ static constexpr int warpSize = 64; __device__ long long int clock64(); __device__ clock_t clock(); +//abort +__device__ void abort(); + //atomicAdd() __device__ int atomicAdd(int* address, int val); __device__ unsigned int atomicAdd(unsigned int* address, diff --git a/src/device_util.cpp b/src/device_util.cpp index 062372f0f4..1efda02933 100644 --- a/src/device_util.cpp +++ b/src/device_util.cpp @@ -839,6 +839,11 @@ __device__ float __hip_ynf(int n, float x) __device__ long long int clock64() { return (long long int)hc::__cycle_u64(); }; __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); }; +//abort +__device__ void abort() +{ + return hc::abort(); +} 
//atomicAdd() __device__ int atomicAdd(int* address, int val) From e6e4fe613c89fc0b3ae0d4fd0b23f59a4e69a042 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 20 Jun 2017 09:38:56 +0530 Subject: [PATCH 157/171] Added device side abort function in HIP/NVCC Change-Id: I6ae35a72a8b9c34852619f02da1a046c8d3b2ed3 --- include/hip/nvcc_detail/hip_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/hip/nvcc_detail/hip_runtime.h b/include/hip/nvcc_detail/hip_runtime.h index 80da388007..8c08f3d151 100644 --- a/include/hip/nvcc_detail/hip_runtime.h +++ b/include/hip/nvcc_detail/hip_runtime.h @@ -109,6 +109,10 @@ kernelName<<>>(__VA_ARGS__);\ #define HIP_DYNAMIC_SHARED_ATTRIBUTE +#ifdef __HIP_DEVICE_COMPILE__ +#define abort() {asm("trap;");} +#endif + #endif #endif From 871c2fc8d6b8854394e2486515b91a1ba5bc8232 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 20 Jun 2017 11:35:52 -0500 Subject: [PATCH 158/171] removed rm for /opt/rocm/hip/src in inline asm sample Change-Id: I0c02bccd4cd35e01a8e889ea1e586ea8baf0ab90 --- samples/2_Cookbook/10_inline_asm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/2_Cookbook/10_inline_asm/Makefile b/samples/2_Cookbook/10_inline_asm/Makefile index 77a7699635..6ad3c201bd 100644 --- a/samples/2_Cookbook/10_inline_asm/Makefile +++ b/samples/2_Cookbook/10_inline_asm/Makefile @@ -32,4 +32,4 @@ test: $(EXECUTABLE) clean: rm -f $(EXECUTABLE) rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o + From b4a39664f0359c7769dca51664fb0b173a1a10a4 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 22 Jun 2017 21:53:32 +0300 Subject: [PATCH 159/171] [HIPIFY] Sync more CUDA Driver API functions. + 4.14. Event Management + 4.15. Execution Control ToDo: 4.16 - 4.31 modules of CUDA Driver API. 
--- .../CUDA_Driver_API_functions_supported_by_HIP.md | 11 ++++++++++- hipify-clang/src/Cuda2Hip.cpp | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 0b3bb540bf..d797b31832 100644 --- a/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -583,12 +583,21 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuEventCreate` | `hipEventCreate` | Creates an event. | +| `cuEventDestroy` | `hipEventDestroy` | Destroys an event. | +| `cuEventElapsedTime` | `hipEventElapsedTime` | Computes the elapsed time between two events. | +| `cuEventQuery` | `hipEventQuery` | Queries an event's status. | +| `cuEventRecord` | `hipEventRecord` | Records an event. | +| `cuEventSynchronize` | `hipEventSynchronize` | Waits for an event to complete. | ## **15. Execution Control** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| `cuFuncGetAttribute` | | Returns information about a function. | +| `cuFuncSetCacheConfig` | `hipFuncSetCacheConfig` | Sets the preferred cache configuration for a device function. | +| `cuFuncSetSharedMemConfig` | | Sets the shared memory configuration for a device function. | +| `cuLaunchKernel` | `hipModuleLaunchKernel` | Launches a CUDA function. | ## **16. 
Execution Control [DEPRECATED]** diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp index 7f9fefa7f9..9b58173899 100644 --- a/hipify-clang/src/Cuda2Hip.cpp +++ b/hipify-clang/src/Cuda2Hip.cpp @@ -987,6 +987,10 @@ struct cuda2hipMap { cuda2hipRename["cuEventRecord"] = {"hipEventRecord", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT, API_DRIVER}; + // Execution Control + cuda2hipRename["cuFuncGetAttribute"] = {"hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuFuncSetCacheConfig"] = {"hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuFuncSetSharedMemConfig"] = {"hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams From d239b1a3fc2ac7d2c47cc91d72bcc349efde9134 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 23 Jun 2017 21:59:24 +0300 Subject: [PATCH 160/171] [HIPIFY] [DOC] Fix typo. --- hipify-clang/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index c0d74dbe48..d74c53f187 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -70,7 +70,7 @@ To set additional options like Language Selection (only "-x cuda" is supported), Delimiter "--" is used to separate hipify-clang options (before the delimiter) from clang options (after the delimiter). It is strongly recommended to always specify the delimiter, even if there are no clang specific options at all, in order to avoid possible errors regarding compilation database; in such case delimeter should be the last option in hipify-clang's command line. -Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). 
+Option "-x cuda" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). ## Disclaimer From 7912e615022de55874fbd09305fcf4e355ffa182 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 10:38:29 -0500 Subject: [PATCH 161/171] Clean up old USE_* and RELEASE.md notes. --- RELEASE.md | 6 ------ include/hip/hcc_detail/host_defines.h | 5 ----- src/hip_hcc.cpp | 3 --- tests/src/hipPointerAttrib.cpp | 9 --------- tests/src/runtimeApi/memory/p2p_copy_coherency.cpp | 4 ---- 5 files changed, 27 deletions(-) diff --git a/RELEASE.md b/RELEASE.md index 5787c59881..d6f3ec594c 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -2,12 +2,6 @@ We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. -Upcoming: -- Stability: Enforce periodic host synchronization to reclaim resources if the application has launched a large - number of commands (>1K) without synchronizing. -- Register keyword now silently ignored on HCC (previously would emit warning). -- Doc updates: Add some more frequently asked questions to FAQ, fix TOC in some files, review. -- Cookbook. =================================================================================================== diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index 140cbb0678..212fd650a3 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -28,7 +28,6 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H -#define USE_PROMOTE_FREE_HCC 1 // Add guard to Generic Grid Launch method #ifndef GENERIC_GRID_LAUNCH @@ -61,11 +60,7 @@ THE SOFTWARE. 
*/ // _restrict is supported by the compiler #define __shared__ tile_static -#if USE_PROMOTE_FREE_HCC==1 #define __constant__ __attribute__((hc)) -#else -#define __constant__ ADDRESS_SPACE_1 -#endif #else // Non-HCC compiler diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index d826a0cec3..061714070e 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -48,9 +48,6 @@ THE SOFTWARE. #include "env.h" -// needs HCC change for hc::no_scope -#define USE_NO_SCOPE 1 - //================================================================================================= //Global variables: //================================================================================================= diff --git a/tests/src/hipPointerAttrib.cpp b/tests/src/hipPointerAttrib.cpp index 7a2ab64bea..bddbff5ce0 100644 --- a/tests/src/hipPointerAttrib.cpp +++ b/tests/src/hipPointerAttrib.cpp @@ -32,7 +32,6 @@ THE SOFTWARE. #endif -#define USE_AV_COPY (__hcc_workweek__ >= 16351) size_t Nbytes = 0; @@ -410,21 +409,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove if (addDir == Up) { for (char *p = basePtr; p=0; p-=bufferSize) { -#if USE_AV_COPY hc::AmPointerInfo info(p, p, bufferSize, acc, false, false); hc::am_memtracker_add(p, info); -#else - hc::am_memtracker_add(p, bufferSize, acc, false); -#endif } } diff --git a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index a5d79464d0..9fadebea1e 100644 --- a/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -36,7 +36,6 @@ THE SOFTWARE. 
#define USE_HCC_MEMTRACKER 0 #endif -#define USE_HSA_COPY 1 int elementSizes[] = {16, 1024,524288}; int nSizes = sizeof(elementSizes) / sizeof(int); @@ -102,11 +101,8 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ hipStream_t stepAStream = gpu0Stream; if (stepAIsCopy) { -#ifdef USE_HSA_COPY HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); -#endif } else { - //assert(0); // not yet supported. unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, dataGpu0_0, dataGpu1, numElements); From 176ff824d1142afaad1e26702d043ffae40cb763 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 10:39:16 -0500 Subject: [PATCH 162/171] Add option to pass names to HCC dispatch API (for debug) --- src/hip_module.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 2a3bfabc28..b8c032da27 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -451,7 +451,13 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, hc::completion_future cf; lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, - (startEvent || stopEvent) ? &cf : nullptr); + (startEvent || stopEvent) ? &cf : nullptr +#define USE_NAMED_KERNEL 0 +#if USE_NAMED_KERNEL + , f->_name.c_str() +#endif + ); + if (startEvent) { From dff260de7eb44011c8e5795fac5823af0192718d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 17:12:04 -0500 Subject: [PATCH 163/171] Add docs for launch_bounds. 
--- docs/markdown/hip_faq.md | 1 - docs/markdown/hip_kernel_language.md | 55 +++++++++++++++++++++------- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/docs/markdown/hip_faq.md b/docs/markdown/hip_faq.md index 07ec5f1d8b..ddf70f2875 100644 --- a/docs/markdown/hip_faq.md +++ b/docs/markdown/hip_faq.md @@ -70,7 +70,6 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for - printf - assert - `__restrict__` - - `__launch_bounds__` - `__threadfence*_`, `__syncthreads*` - Unbounded loop unroll diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 3cb7b17a0c..0485188a1f 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -610,30 +610,59 @@ Device-side dynamic global memory allocation is under development. HIP now incl implementation of malloc and free that can be called from device functions. ## `__launch_bounds__` -GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) that are shared among the active warps. Using more resources can increase the kernel’s IPC, but it reduces the resources available for other warps and limits the number of warps that can run simultaneously. Thus, GPUs exhibit a complex relationship between resource usage and performance. `__launch_bounds__` allows the application to provide usage hints that influence the resources (primarily registers) employed by the generated code. It’s a function attribute that must be attached to a `__global__` function: + + +GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simultaneously running. Thus GPUs have a complex relationship between resource usage and performance. 
+ +__hip_launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. +__hip_launch_bounds__ is a function attribute that must be attached to a __global__ function: ``` -__global__ void -`__launch_bounds__`(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) +__global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... MyKernel(hipGridLaunch lp, ...) ... ``` -`__launch_bounds__` supports two parameters: +__launch_bounds__ supports two parameters: +- MAX_THREADS_PER_BLOCK - The programmer guarantees that the kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. +The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z). +- MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp (primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage. - **requiredMaxThreadsPerBlock**---the programmer guarantees that the kernel will launch with threadsPerBlock less than requiredMaxThreadsPerBlock. (In nvcc, this parameter maps to the _.maxntid_ PTX directive; in hcc, it maps to the HSAIL _requiredworkgroupsize_ directive.) 
If launch_bounds is unspecified, requiredMaxThreadsPerBlock is the maximum block size that the device supports (typically 1,024 or larger). Specifying requiredMaxThreadsPerBlock less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation supporting all possible block sizes at launch time. The threadsPerBlock value is the product hipBlockDim_x * hipBlockDim_y * hipBlockDim_z. -- **minBlocksPerMultiprocessor**---directs the compiler to minimize resource usage so that the requested number of blocks can be simultaneously active on a multiprocessor. Because active blocks compete for the same fixed resource pool, the compiler must reduce the resource requirements of each block (primarily registers). minBlocksPerMultiprocessor is optional and defaults to 1 if unspecified. Selecting a minBlocksPerMultiprocessor value greater than 1 effectively constrains the compiler's resource usage. +### Compiler Impact +The compiler uses these parameters as follows: +- The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources. +- Compilation fails if the compiler cannot generate a kernel which meets the requirements of the specified launch bounds. +- From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time. +Values of MAX_THREADS_PER_BLOCK less than the default allow the compiler to use a larger pool of registers: each warp uses registers, and this hint constrains the launch to a warps/block size which is less than maximum. +- From MIN_WARPS_PER_EU, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks). +If MIN_WARPS_PER_EU is 1, then the kernel can use all registers supported by the multiprocessor. 
+- The compiler ensures that the number of registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions. +- The compiler may use heuristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this case, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time. -The compiler uses these two parameters as follows: -- It employs the hints only to manage register usage and does not automatically reduce shared memory or other resources. -- Compilation fails if the compiler cannot generate a kernel that meets the requirements of the specified launch bounds. -- From requiredMaxThreadsPerBlock, the compiler derives the maximum number of warps per block that are usable at launch time. Values less than the default allow the compiler to use a larger register pool: each warp uses registers, and this hint constrains the launch to a warps-per-block size less than maximum. -- From minBlocksPerMultiprocessor, the compiler derives a maximum number of registers that the kernel can use (to meet the required number of simultaneously active blocks). If the value is 1, the kernel can use all registers supported by the multiprocessor. -The compiler ensures that the kernel uses fewer registers than both allowed maxima specify, typically by spilling to shared memory or using more instructions. It may use heuristics to increase register usage or may simply be able to avoid spilling. 
The requiredMaxThreadsPerBlock parameter is particularly useful in this case, since it allows the compiler to use more registers---avoiding situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size never sees use at launch time. +### CU and EU Definitions +A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to simultaneous multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing. + +### Porting from CUDA __launch_bounds +CUDA defines a __launch_bounds which is also designed to control occupancy: +``` +__launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) +``` -HIP/hcc will parse the `launch_bounds` attribute but silently ignores the performance hint. Full support is under development. +- The second parameter of __launch_bounds must be converted to the format used by __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors. (This conversion is performed automatically by the clang hipify tools.) 
+``` +MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32 +``` +The key differences in the interface are: +- Warps (rather than blocks): +The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding. Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. +- Execution Units (rather than multiProcessor): +The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor. The hipDeviceProps has a field executionUnitsPerMultiprocessor. +Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HCC platforms, if desired. + + +### maxregcount Unlike nvcc, hcc does not support the "--maxregcount" option. Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file. hip_launch_bounds works on both hcc and nvcc targets. 
From 522e059a79bc0c68fde0234f572a6691f220fb66 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 23 Jun 2017 19:05:34 -0500 Subject: [PATCH 164/171] fixed default args for symbol memcpy apis Change-Id: Ie0b63f8b9c5535eb3946bd6af3f30fe71a015244 --- include/hip/hcc_detail/hip_runtime_api.h | 8 ++++---- include/hip/nvcc_detail/hip_runtime_api.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index fde38c8395..724bf09b21 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1194,7 +1194,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz * * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync */ -hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind); +hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyHostToDevice); /** @@ -1214,11 +1214,11 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t siz * * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync */ -hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, 
hipStream_t stream); +hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0); -hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind); +hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost); -hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream); +hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0); /** * @brief Copy data from src to dst asynchronously. diff --git a/include/hip/nvcc_detail/hip_runtime_api.h b/include/hip/nvcc_detail/hip_runtime_api.h index f92523a3e3..b1011aac6c 100644 --- a/include/hip/nvcc_detail/hip_runtime_api.h +++ b/include/hip/nvcc_detail/hip_runtime_api.h @@ -360,16 +360,16 @@ inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType))); } -inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream) { - return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType))); +inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream = 0) { + return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream)); } -inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind) 
+inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost) { return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream) +inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0) { return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream)); } From 1df08626c896d6a89beedc098a1fa7bfda00be27 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 26 Jun 2017 15:29:38 -0500 Subject: [PATCH 165/171] Add support of HIP_HIDDEN_FREE_MEM, to deduct the returned available memory from hipMemGetInfo API, measured in MB. Change-Id: I7a8260c12e032e04e26611db4c38c893a29f2653 --- src/hip_hcc.cpp | 5 +++-- src/hip_hcc_internal.h | 2 +- src/hip_memory.cpp | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 061714070e..364db80537 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -78,6 +78,7 @@ int HIP_FORCE_P2P_HOST = 0; int HIP_FAIL_SOC = 0; int HIP_DENY_PEER_ACCESS = 0; +int HIP_HIDDEN_FREE_MEM = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; @@ -1204,8 +1205,8 @@ void HipReadEnv() tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels); } READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); - - + + READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0, "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified in MB. 
Impacts hipMemGetInfo." ); READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<_acc, &deviceMemSize, &hostMemSize, &userMemSize); *free = device->_props.totalGlobalMem - deviceMemSize; + + // Deduct the amount of memory from the free memory reported from the system + if(HIP_HIDDEN_FREE_MEM) + *free -= (size_t)HIP_HIDDEN_FREE_MEM*1024*1024; } else { e = hipErrorInvalidValue; From 1c3a8b256469428fbbb8cfa80c0be1bfed405def Mon Sep 17 00:00:00 2001 From: sunway513 Date: Mon, 26 Jun 2017 22:47:22 +0000 Subject: [PATCH 166/171] Fix docs for HIP_TRACE_API bit masks. --- docs/markdown/hip_profiling.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index ef349ef2a5..ac277c8433 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -268,9 +268,12 @@ PASSED! ``` HIP_TRACE_API supports multiple levels of debug information: - - 0x1 = print all HIP APIs - - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets. Includes hipLaunchKernel, hipMemcpy*, hipMemset*. - - 0x4 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + - 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. + - 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipModuleLaunchKernel. + - 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. + - 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + +These can be combined. For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU. 
#### Color From 148dbc1027b908bfc32eca7231c2b250878d9661 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 27 Jun 2017 12:17:12 -0500 Subject: [PATCH 167/171] Set default HIP_HIDDEN_FREE_MEM --- src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp index 364db80537..be591f2f04 100644 --- a/src/hip_hcc.cpp +++ b/src/hip_hcc.cpp @@ -78,7 +78,7 @@ int HIP_FORCE_P2P_HOST = 0; int HIP_FAIL_SOC = 0; int HIP_DENY_PEER_ACCESS = 0; -int HIP_HIDDEN_FREE_MEM = 0; +int HIP_HIDDEN_FREE_MEM = 256; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; From 1e1654c225290793269f505cb67880f68c764407 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Tue, 27 Jun 2017 14:15:16 -0500 Subject: [PATCH 168/171] Remove some warning debug info and add weak attribute back to GGL __global__ define Change-Id: I2021b107dda697b1262d44fa1506465e94a3916b --- include/hip/hcc_detail/host_defines.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/include/hip/hcc_detail/host_defines.h b/include/hip/hcc_detail/host_defines.h index 212fd650a3..b2e7ac2617 100644 --- a/include/hip/hcc_detail/host_defines.h +++ b/include/hip/hcc_detail/host_defines.h @@ -41,13 +41,10 @@ THE SOFTWARE. 
#define __host__ __attribute__((cpu)) #define __device__ __attribute__((hc)) -//#warning "HOST DEFINE header included" #if GENERIC_GRID_LAUNCH == 0 -//#warning "original global define reached" #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -//#warning "GGL global define reached" -#define __global__ __attribute__((annotate("hip__global__"), hc, used)) +#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From f41b77e2475be47d52ed3bd97ac54ac84e50b27c Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 10:26:04 +0530 Subject: [PATCH 169/171] [build] link libCXActivityLogger.so when COMPILE_HIP_ATP_MARKER=1 Change-Id: I0bfffd924cd858bec7436acf3ccb1e3375172f27 --- CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index eee1a14a8a..b3ea5a3ca3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,6 +193,9 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic") #find_package(LLVM HINTS ${HCC_HOME}/compiler/lib/cmake) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900") + if(COMPILE_HIP_ATP_MARKER) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger") + endif() add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) target_link_libraries(hip_hcc PRIVATE hc_am) #target_link_libraries(hip_hcc PUBLIC LLVMAMDGPUUtils) From 91f82ce541b51ad1106a28e951b410a256e61f62 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 11:32:25 +0530 Subject: [PATCH 170/171] Update toc in markdown documentation Change-Id: I6da7053672b306442f3640fff3471efe25593870 --- docs/markdown/hip_bugs.md | 4 +++- 
docs/markdown/hip_kernel_language.md | 4 ++++ docs/markdown/hip_profiling.md | 5 ----- hipify-clang/README.md | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/docs/markdown/hip_bugs.md b/docs/markdown/hip_bugs.md index 91b2a5a019..78f0e53467 100644 --- a/docs/markdown/hip_bugs.md +++ b/docs/markdown/hip_bugs.md @@ -1,7 +1,9 @@ -# HIP Bugs +# HIP Bugs + - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) +- [Can't find kernels inside dynamic linked library](#cant-find-kernels-inside-dynamic-linked-library) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) - [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor) - [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) diff --git a/docs/markdown/hip_kernel_language.md b/docs/markdown/hip_kernel_language.md index 0485188a1f..cfa5d0f871 100644 --- a/docs/markdown/hip_kernel_language.md +++ b/docs/markdown/hip_kernel_language.md @@ -40,6 +40,10 @@ - [Printf](#printf) - [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation) - [`__launch_bounds__`](#__launch_bounds__) + * [Compiler Impact](#compiler-impact) + * [CU and EU Definitions](#cu-and-eu-definitions) + * [Porting from CUDA __launch_bounds](#porting-from-cuda-__launch_bounds) + * [maxregcount](#maxregcount) - [Register Keyword](#register-keyword) - [Pragma Unroll](#pragma-unroll) - [In-Line Assembly](#in-line-assembly) diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index ac277c8433..a659216044 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -23,11 +23,6 @@ This document starts with some of the general capabilities of CodeXL and then de - [Tracing and 
Debug](#tracing-and-debug) * [Tracing HIP APIs](#tracing-hip-apis) + [Color](#color) - * [Using HIP_DB](#using-hip_db) - * [Using ltrace](#using-ltrace) - * [Chicken bits](#chicken-bits) - * [Debugging HIP Applications](#debugging-hip-applications) - * [General Debugging Tips](#general-debugging-tips) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index d74c53f187..20456f3bff 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -5,7 +5,7 @@ - [Using hipify-clang](#using-hipify-clang) * [Build and install](#build-and-install) * [Running and using hipify-clang](#running-and-using-hipify-clang) - + [Disclaimer](#disclaimer) +- [Disclaimer](#disclaimer) From 6674ef34a89aa5e7a23edce758381157f056d52e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 11:32:40 +0530 Subject: [PATCH 171/171] Update release notes Change-Id: I5001ef03692159fcf9825102b37066ec26e6b8d2 --- RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/RELEASE.md b/RELEASE.md index d6f3ec594c..a1e580b7b0 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -8,12 +8,15 @@ We have attempted to document known bugs and limitations - in particular the [HI ## Revision History: =================================================================================================== +Release: 1.2 +Date: 2017.06.29 - new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent - added support for building hipify-clang using clang 3.9 - hipify-clang updates for CUDA 8.0 runtime+driver support - renamed hipify to hipify-perl - initial implementation of hipify-cmakefile - several documentation updates & bug fixes +- support for abort() function in device code ===================================================================================================