From c41cf40c4215911ac7a0bea97d49cfe668ddd82e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 3 Apr 2017 20:32:50 +0300 Subject: [PATCH 001/171] [HIPIFY] Remove hipLaunchParm in HIP kernel declaration. [ROCm/clr commit: a9268008acd094e052d184dfc148b5930e5bbb42] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 6c24fbf288..27447b8d8c 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2343,9 +2343,6 @@ private: LangOptions DefaultLangOptions; SmallString<40> XStr; raw_svector_ostream OS(XStr); - StringRef initialParamList; - OS << "hipLaunchParm lp"; - size_t repLength = OS.str().size(); SourceLocation sl = kernelDecl->getNameInfo().getEndLoc(); SourceLocation kernelArgListStart = Lexer::findLocationAfterToken(sl, tok::l_paren, *SM, DefaultLangOptions, true); DEBUG(dbgs() << kernelArgListStart.printToString(*SM)); @@ -2355,14 +2352,12 @@ private: SourceLocation kernelArgListStart(pvdFirst->getLocStart()); SourceLocation kernelArgListEnd(pvdLast->getLocEnd()); SourceLocation stop = Lexer::getLocForEndOfToken(kernelArgListEnd, 0, *SM, DefaultLangOptions); - repLength += SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); - initialParamList = StringRef(SM->getCharacterData(kernelArgListStart), repLength); - OS << ", " << initialParamList; + size_t repLength = SM->getCharacterData(stop) - SM->getCharacterData(kernelArgListStart); + OS << StringRef(SM->getCharacterData(kernelArgListStart), repLength); + Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str()); + FullSourceLoc fullSL(sl, *(Result.SourceManager)); + insertReplacement(Rep0, fullSL); } - DEBUG(dbgs() << "initial paramlist: " << initialParamList << "\n" << "new paramlist: " << OS.str() << "\n"); 
- Replacement Rep0(*(Result.SourceManager), kernelArgListStart, repLength, OS.str()); - FullSourceLoc fullSL(sl, *(Result.SourceManager)); - insertReplacement(Rep0, fullSL); } bool cudaCall(const MatchFinder::MatchResult &Result) { From 7eb7ef2d8eb8d9d1a1bbfe3d06fedb2e97c5bd16 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 3 Apr 2017 22:05:01 +0300 Subject: [PATCH 002/171] [HIPIFY] GGL support + hipLaunchKernel -> hipLaunchKernelGGL + macro HIP_KERNEL_NAME is no longer used [ROCm/clr commit: 81415a41b807f9e25b048ef17b6e5a77022c47b5] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 27447b8d8c..383af0440c 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2426,9 +2426,9 @@ private: XStr.clear(); if (calleeName.find(',') != StringRef::npos) { SmallString<128> tmpData; - calleeName = Twine("HIP_KERNEL_NAME(" + calleeName + ")").toStringRef(tmpData); + calleeName = Twine("(" + calleeName + ")").toStringRef(tmpData); } - OS << "hipLaunchKernel(" << calleeName << ","; + OS << "hipLaunchKernelGGL(" << calleeName << ","; const CallExpr *config = launchKernel->getConfig(); DEBUG(dbgs() << "Kernel config arguments:" << "\n"); SourceManager *SM = Result.SourceManager; @@ -2468,7 +2468,7 @@ private: Replacement Rep(*SM, launchKernel->getLocStart(), length, OS.str()); FullSourceLoc fullSL(launchKernel->getLocStart(), *SM); insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernel", CONV_KERN, API_RUNTIME}; + hipCounter counter = {"hipLaunchKernelGGL", CONV_KERN, API_RUNTIME}; updateCounters(counter, refName.str()); return true; } From 14c38df50041a51b70c6f5777cdf17ba6f366bf0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 4 Apr 2017 08:06:09 +0530 Subject: [PATCH 003/171] hip_doc packaging 
script updates - Gracefully handle missing doxygen and grip tools Change-Id: I1a4a653d687c136c6d9237062ab4d02bc6cb3db1 [ROCm/clr commit: b54954b09f19345efb0b7de2b8ddd617b72ff315] --- projects/clr/hipamd/packaging/hip_doc.txt | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/packaging/hip_doc.txt b/projects/clr/hipamd/packaging/hip_doc.txt index bbcaf54ec8..d5a0c471b1 100644 --- a/projects/clr/hipamd/packaging/hip_doc.txt +++ b/projects/clr/hipamd/packaging/hip_doc.txt @@ -1,12 +1,19 @@ cmake_minimum_required(VERSION 2.8.3) project(hip_doc) -add_custom_target(build_doxygen ALL +find_program(DOXYGEN_EXE doxygen) +if(DOXYGEN_EXE) + add_custom_target(build_doxygen ALL COMMAND HIP_PATH=@hip_SOURCE_DIR@ doxygen @hip_SOURCE_DIR@/docs/doxygen-input/doxy.cfg) -add_custom_target(convert_md_to_html ALL + install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI) +endif() + +find_program(GRIP_EXE grip) +if(GRIP_EXE) + add_custom_target(convert_md_to_html ALL COMMAND @hip_SOURCE_DIR@/packaging/convert_md_to_html.sh @hip_SOURCE_DIR@ ${PROJECT_BINARY_DIR}/md2html) -install(DIRECTORY RuntimeAPI/html DESTINATION docs/docs/RuntimeAPI) -install(DIRECTORY md2html/ DESTINATION docs) + install(DIRECTORY md2html/ DESTINATION docs) +endif() ############################# # Packaging steps From 82965fb2d6c3ca529fcb53772b882f307aed2b10 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 4 Apr 2017 08:07:56 +0530 Subject: [PATCH 004/171] Remove unused packaging scripts Change-Id: I609391b905810eb24f7fd4ea6d7f27166ca001b3 [ROCm/clr commit: 714dd99e2dca23c6f1e407ee067fd8294b243674] --- .../packaging/create_hip_samples_installer.sh | 23 ------------------- 1 file changed, 23 deletions(-) delete mode 100755 projects/clr/hipamd/packaging/create_hip_samples_installer.sh diff --git a/projects/clr/hipamd/packaging/create_hip_samples_installer.sh b/projects/clr/hipamd/packaging/create_hip_samples_installer.sh deleted file mode 100755 
index 91789d2524..0000000000 --- a/projects/clr/hipamd/packaging/create_hip_samples_installer.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -function die { - echo "${1-Died}." >&2 - exit 1 -} - -payload=$1 -script=$2 -[ "$payload" != "" ] || [ "$script" != "" ] || die "Invalid arguments!" -tmp=__extract__$RANDOM - -printf "#!/bin/bash -samples_dir=\$1 -[ \"\$samples_dir\" != \"\" ] || read -e -p \"Enter the path to extract the HIP samples: \" samples_dir -mkdir -p \$samples_dir -PAYLOAD=\`awk '/^__PAYLOAD_BELOW__/ {print NR + 1; exit 0; }' \$0\` -tail -n+\$PAYLOAD \$0 | tar -xz -C \$samples_dir -echo \"HIP samples installed in \$samples_dir\" -exit 0 -__PAYLOAD_BELOW__\n" > "$tmp" - -cat "$tmp" "$payload" > "$script" && rm "$tmp" -chmod +x "$script" From a14593b331e351916a5360035c911cc594b3bb72 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 4 Apr 2017 15:51:10 +0530 Subject: [PATCH 005/171] mgpu IPC support fix Change-Id: I12e4b2fd189c3658efd3b07defa18ece3853b0eb [ROCm/clr commit: 4906cd5f0d526a2d464709aa1be393122c23df18] --- projects/clr/hipamd/src/hip_memory.cpp | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 805fc9efc0..da5530349f 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -1260,10 +1260,15 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; //Attach ipc memory - hsa_status_t hsa_status = - hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, 1, agent, devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - hipStatus = hipErrorMapBufferObjectFailed; + auto ctx= ihipGetTlsDefaultCtx(); + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + hsa_status_t hsa_status = + 
hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + hipStatus = hipErrorMapBufferObjectFailed; + } #else hipStatus = hipErrorRuntimeOther; #endif From 19c64a41a38efda861c187729045e589e98c3cb6 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 5 Apr 2017 16:23:27 -0500 Subject: [PATCH 006/171] hip_debug.md update from Alex, regarding __device__ function restrictions Change-Id: I5e54fd97fc632d4283f76282e3935396a1aad235 [ROCm/clr commit: bfe499a13b4b727e44458a09492c328c3e53055d] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 90 ++++++++++++++++++- 1 file changed, 89 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index e15c37fc54..addf2c17f6 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -5,6 +5,7 @@ - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm) - [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) +- [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) @@ -46,4 +47,91 @@ hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_H ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); -2. using the macro based dispatch mechanism i.e. 
hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. \ No newline at end of file +2. using the macro based dispatch mechanism i.e. hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. + + +### HIP is more restrictive in enforcing restrictions +By the language specification, both for HIP and CUDA it is forbidden to call a +`__device__` function in a `__host__` context. In practice, you may observe +differences in the strictness of this restriction, with HIP exhibiting a tighter +adherence to the specification i.e. being less tolerant of infringing code. The +solution is to always ensure that all functions which are to be called in a +`__device__` context are correctly annotated to reflect it. An interesting case +where these differences emerge is shown below (this has been lifted from +production code, and relies on a the common [C++ Member Detector idiom][1], as it +would be implemented pre C++11): +```c++ +#include +#include + +struct meta_yes { char a[1]; }; +struct meta_no { char a[2]; }; + +// Dual restriction is necessary in HIP if the detector is to work for +// __device__ contexts as well as __host__ ones. NVCC is less strict. +template +__host__ __device__ +const T& return_ref(); + +template +struct has_nullary_operator { + // Dual restriction is necessary in HIP if the detector is to work for + // __device__ contexts as well as __host__ ones. NVCC is less strict. 
+ template + __host__ __device__ + static + meta_yes testFunctor( + C const *, + typename std::enable_if< + (sizeof(return_ref().operator()()) > 0)>::type* = nullptr); + static + meta_no testFunctor(...); + + enum { + value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; +}; + +template< + typename Scalar, + typename NullaryOp, + bool has_nullary = has_nullary_operator::value> +struct nullary_wrapper { + template + T packetOp() const { return T{1}; } +}; + + +template +struct nullary_wrapper { + template + T packetOp() const { return T{10}; } +}; + +// This specialisation will fail to compile. +template +struct nullary_wrapper {}; + +template +struct UniformRandomGenerator; + +template<> struct UniformRandomGenerator { + float operator()() const [[hc]] { return 42.0; } +}; + +__device__ +void this_will_not_compile_if_detector_is_not_marked_device() +{ + float f = + nullary_wrapper< + float, UniformRandomGenerator>().packetOp(); +} + +__host__ +void this_will_not_compile_if_detector_is_marked_device_only() +{ + float f = + nullary_wrapper< + float, UniformRandomGenerator>().packetOp(); +} +``` +[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector \ No newline at end of file From 2dbc9b1d0823395a181349c441e3fd7fc4dfafdd Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 17:43:20 -0500 Subject: [PATCH 007/171] Add bug descrip for "no matching constructor" [ROCm/clr commit: f345cd6f47644c56f18095e099e9939caa2c6977] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 66 ++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index addf2c17f6..234dec4e0e 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -50,6 +50,70 @@ hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_H 2. using the macro based dispatch mechanism i.e. 
hipLaunchKernel* only works for functions that take no more than 20 arguments (this limit can be increased up to 126, and is temporary until we can enable C++14 mode and use variadic generic lambdas); no such limitation applies do dispatching directly through grid_launch. +### Errors related to `no matching constructor` + +The symptom is the compiler would complain about errors like `no matching constructor` for classes/structs passed as arguments into a GPU kernel. Often, this is caused by a design limitation in HCC where array-typed member variables inside a class/struct can’t be correctly passed into GPU kernels. To mitigate this issue, a custom serializer/deserializer pair is provided. + +For example, `Foo` in the code snippets below contains an array-typed member variable `table`, which would fail the compiler if used as a kernel argument. + +``` +struct Foo { + // table is an array, which makes foo + int table[3]; +}; +``` + +An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments: + +``` + +struct Foo { + int table[3]; + + // user-provided CPU serializer + // must append the contents of the array member as kernel arguments +#ifdef __HCC__ + __attribute__((annotate(“serialize”))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + for (int i = 0; i < 3; ++i) + s.Append(sizeof(int), &table[i]); + } +#endif +}; +``` + +Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example: + +``` +struct Foo { + int table[3]; + + // user-provided GPU deserializer + // table has 3 int elements, so deserializer must have 3 int parameters. 
+#ifdef __HCC__ + __attribute__((annotate(“user_deserialize”))) + Foo(int x0, int x1, int x2) [[cpu]][[hc]] { + table[0] = x0; + table[1] = x1; + table[2] = x2; + } +#endif + +#ifdef __HCC__ + __attribute__((annotate(“serialize”))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(int), &table[0]); + s.Append(sizeof(int), &table[1]); + s.Append(sizeof(int), &table[2]); + } +#endif +}; +``` + + +Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types. + + ### HIP is more restrictive in enforcing restrictions By the language specification, both for HIP and CUDA it is forbidden to call a `__device__` function in a `__host__` context. In practice, you may observe @@ -134,4 +198,4 @@ void this_will_not_compile_if_detector_is_marked_device_only() float, UniformRandomGenerator>().packetOp(); } ``` -[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector \ No newline at end of file +[1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector From 251db89f03f2db575bb3063cc7715eda96d4dbba Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 19:40:00 -0500 Subject: [PATCH 008/171] Doc cleanup & add bug descript for restrict specifier issue [ROCm/clr commit: 294ffbb51f1e67d9ebae30e4cf37bfe330c18075] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 92 +++++++------------ 1 file changed, 34 insertions(+), 58 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index 234dec4e0e..c53b68d796 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -15,9 +15,11 @@ Some common code practices may lead to hipcc generating a error with the form : undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ To workaround, try: -- Avoid calling hcLaunchKernel from a function with the 
__host__ attribute +- Avoid calling hipLaunchKernel from a function with the __host__ attribute +``` __host__ MyFunc(…) { hipLaunchKernel(myKernel, …) +``` - Avoid use of static with kernel definition: static __global__ MyKernel - Avoid defining kernels in anonymous namespace @@ -25,25 +27,6 @@ namespace { __global__ MyKernel … - Avoid calling member functions -If hipLaunchKernel takes parameters that request explicitly memcpy, then it will cause application hang. -Reason is that the hipLaunchKernel macro locks the stream. -If kernel paramters are actually function calls which invoke other hip apis (i.e. memcpy) to the same stream, then deadlock occurs. - -To workaround, try: -Move the function calls so they occur outside the hipLaunchKernel macro, store results in temps, then use the tems inside the kernel. - -``` -// Example pseudo code causing system hang: -// "bottom[0]->gpu_data()" calls hipMemcpy() implicitly and using the same stream, cause deadlock condition. -hipLaunchKernel(HIP_KERNEL_NAME(LRNComputeDiff),dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bottom[0]->gpu_data()); - -// Move "gpu_data()" ouside of hipLaunchKernel to avoid hang. -auto bot_gpu_data = bottom[0]->gpu_data(); -hipLaunchKernel( LRNComputeDiff, dim3(CAFFE_GET_BLOCKS(n_threads)), dim3(CAFFE_HIP_NUM_THREADS), 0, 0, n_threads, - bot_gpu_data); - -``` ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. they cannot be given internal linkage (this would clash with __attribute__((weak))); @@ -115,87 +98,80 @@ Rather than create serializer functions, another workaround is to pass the membe ### HIP is more restrictive in enforcing restrictions -By the language specification, both for HIP and CUDA it is forbidden to call a +The language specification for HIP and CUDA forbid calling a `__device__` function in a `__host__` context. 
In practice, you may observe differences in the strictness of this restriction, with HIP exhibiting a tighter -adherence to the specification i.e. being less tolerant of infringing code. The -solution is to always ensure that all functions which are to be called in a +adherence to the specification and thus less tolerant of infringing code. The +solution is to ensure that all functions which are called in a `__device__` context are correctly annotated to reflect it. An interesting case -where these differences emerge is shown below (this has been lifted from -production code, and relies on a the common [C++ Member Detector idiom][1], as it -would be implemented pre C++11): +where these differences emerge is shown below. This relies on a the common +[C++ Member Detector idiom][1], as it would be implemented pre C++11): + ```c++ #include #include -struct meta_yes { char a[1]; }; -struct meta_no { char a[2]; }; +struct aye { bool a[1]; }; +struct nay { bool a[2]; }; // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. template __host__ __device__ -const T& return_ref(); +const T& cref_t(); template -struct has_nullary_operator { +struct Has_call_operator { // Dual restriction is necessary in HIP if the detector is to work for // __device__ contexts as well as __host__ ones. NVCC is less strict. 
template __host__ __device__ static - meta_yes testFunctor( + aye test( C const *, typename std::enable_if< - (sizeof(return_ref().operator()()) > 0)>::type* = nullptr); + (sizeof(cref_t().operator()()) > 0)>::type* = nullptr); static - meta_no testFunctor(...); + nay test(...); - enum { - value = sizeof(testFunctor(static_cast(0))) == sizeof(meta_yes) }; + enum { value = sizeof(test(static_cast(0))) == sizeof(aye) }; }; -template< - typename Scalar, - typename NullaryOp, - bool has_nullary = has_nullary_operator::value> -struct nullary_wrapper { - template - T packetOp() const { return T{1}; } +template::value> +struct Wrapper { + template + V f() const { return T{1}; } }; -template -struct nullary_wrapper { - template - T packetOp() const { return T{10}; } +template +struct Wrapper { + template + V f() const { return T{10}; } }; -// This specialisation will fail to compile. -template -struct nullary_wrapper {}; +// This specialisation will yield a compile-time error, if selected. +template +struct Wrapper {}; template -struct UniformRandomGenerator; +struct Functor; -template<> struct UniformRandomGenerator { - float operator()() const [[hc]] { return 42.0; } +template<> struct Functor { + __device__ + float operator()() const { return 42.0f; } }; __device__ void this_will_not_compile_if_detector_is_not_marked_device() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } __host__ void this_will_not_compile_if_detector_is_marked_device_only() { - float f = - nullary_wrapper< - float, UniformRandomGenerator>().packetOp(); + float f = Wrapper>().f(); } ``` [1]: https://en.wikibooks.org/wiki/More_C%2B%2B_Idioms/Member_Detector From 2f050e56f8bb477aa19636d7499b3765a6a7dc0b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 5 Apr 2017 21:59:11 -0500 Subject: [PATCH 009/171] Update bug workarounds to reflect tool improvements. 
[ROCm/clr commit: e8f4f7664915e12cfec8d5ebf61c1df7cd50afe3] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index c53b68d796..14f2935f17 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -9,23 +9,23 @@ -### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm** +### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**` Some common code practices may lead to hipcc generating a error with the form : undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ -To workaround, try: -- Avoid calling hipLaunchKernel from a function with the __host__ attribute -``` -__host__ MyFunc(…) { -hipLaunchKernel(myKernel, …) -``` +Suggested workarounds: - Avoid use of static with kernel definition: +```c++ static __global__ MyKernel -- Avoid defining kernels in anonymous namespace +``` + +- Avoid defining kernels in anonymous namespace : +```c++ namespace { -__global__ MyKernel … -- Avoid calling member functions + __global__ MyKernel +} +``` ### What is the current limitation of HIP Generic Grid Launch method? 
From 388f35c67ac9d8ef219f54cfa0a99a7dc025db19 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 5 Apr 2017 22:25:41 -0500 Subject: [PATCH 010/171] add extra guard to grid_launch_GGL header Change-Id: I120619c08ea2d084804fcb1639efbe6c4648dde9 [ROCm/clr commit: 335e107f5f86efa0996e30ee9bb26af7e3c4e2f2] --- projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 4fd7c3ff3a..1d765dfc48 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #pragma once +#if GENERIC_GRID_LAUNCH == 1 #include "concepts.hpp" #include "helpers.hpp" @@ -851,3 +852,4 @@ namespace hip_impl ##__VA_ARGS__);\ } } +#endif //GENERIC_GRID_LAUNCH From d1c334df6602d0792395fb81fd2df714b6399cb8 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 6 Apr 2017 09:29:44 -0500 Subject: [PATCH 011/171] added __host__ for complex functions and corrected memset and memcpy test Change-Id: I9ffefb7a0025aa111a54d20d2766982df15532e7 [ROCm/clr commit: 42739c37efa346aa366e26d1ad6cd99b01c8789b] --- .../include/hip/hcc_detail/hip_complex.h | 50 +++++++++---------- .../tests/src/deviceLib/hipDeviceMemcpy.cpp | 45 ++++++++++++----- 2 files changed, 58 insertions(+), 37 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h index 9ff75d381a..dd742e484c 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h @@ -177,45 +177,45 @@ COMPLEX_SCALAR_PRODUCT(hipDoubleComplex, unsigned long long) #endif -__device__ static inline float hipCrealf(hipFloatComplex z){ +__device__ __host__ static inline float hipCrealf(hipFloatComplex z){ return z.x; } 
-__device__ static inline float hipCimagf(hipFloatComplex z){ +__device__ __host__ static inline float hipCimagf(hipFloatComplex z){ return z.y; } -__device__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){ +__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b){ hipFloatComplex z; z.x = a; z.y = b; return z; } -__device__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ +__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z){ hipFloatComplex ret; ret.x = z.x; ret.y = -z.y; return ret; } -__device__ static inline float hipCsqabsf(hipFloatComplex z){ +__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z){ return z.x * z.x + z.y * z.y; } -__device__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x + q.x, p.y + q.y); } -__device__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x - q.x, p.y - q.y); } -__device__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q){ return make_hipFloatComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ +__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q){ float sqabs = hipCsqabsf(q); hipFloatComplex ret; ret.x = (p.x * q.x + p.y * q.y)/sqabs; @@ -223,51 +223,51 @@ __device__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatCom return ret; } -__device__ static inline float hipCabsf(hipFloatComplex z){ +__device__ __host__ static inline float 
hipCabsf(hipFloatComplex z){ return sqrtf(hipCsqabsf(z)); } -__device__ static inline double hipCreal(hipDoubleComplex z){ +__device__ __host__ static inline double hipCreal(hipDoubleComplex z){ return z.x; } -__device__ static inline double hipCimag(hipDoubleComplex z){ +__device__ __host__ static inline double hipCimag(hipDoubleComplex z){ return z.y; } -__device__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ +__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b){ hipDoubleComplex z; z.x = a; z.y = b; return z; } -__device__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ +__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ hipDoubleComplex ret; ret.x = z.x; ret.y = z.y; return ret; } -__device__ static inline double hipCsqabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){ return z.x * z.x + z.y * z.y; } -__device__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x + q.x, p.y + q.y); } -__device__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x - q.x, p.y - q.y); } -__device__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q){ return make_hipDoubleComplex(p.x * q.x - p.y * q.y, p.y * q.x + p.x * q.y); } -__device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ +__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q){ double sqabs = hipCsqabs(q); hipDoubleComplex ret; ret.x = (p.x * 
q.x + p.y * q.y)/sqabs; @@ -275,28 +275,28 @@ __device__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleC return ret; } -__device__ static inline double hipCabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCabs(hipDoubleComplex z){ return sqrtf(hipCsqabs(z)); } typedef hipFloatComplex hipComplex; -__device__ static inline hipComplex make_hipComplex(float x, +__device__ __host__ static inline hipComplex make_hipComplex(float x, float y){ return make_hipFloatComplex(x, y); } -__device__ static inline hipFloatComplex hipComplexDoubleToFloat +__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat (hipDoubleComplex z){ return make_hipFloatComplex((float)z.x, (float)z.y); } -__device__ static inline hipDoubleComplex hipComplexFloatToDouble +__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble (hipFloatComplex z){ return make_hipDoubleComplex((double)z.x, (double)z.y); } -__device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ +__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r){ float real = (p.x * q.x) + r.x; float imag = (q.x * p.y) + r.y; @@ -306,7 +306,7 @@ __device__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComp return make_hipComplex(real, imag); } -__device__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ float real = (p.x * q.x) + r.x; float imag = (q.x * p.y) + r.y; diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp index 54fd02c0c2..3843c07bb9 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -1,18 +1,29 @@ -#include +#include #include 
"hip/hip_runtime.h" #include "hip/hip_runtime_api.h" +#include "../test_common.h" + #define LEN 1030 #define SIZE LEN << 2 -__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In, uint32_t *Vald) +/* HIT_START + * BUILD: %t %s ../test_common.cpp + * RUN: %t + * HIT_END + */ + + +__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) { - memcpy(Out, In, SIZE, Vald); + int tx = hipThreadIdx_x; + memcpy(Out + tx, In + tx, SIZE/LEN); } __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { - memset(ptr, val, size); + int tx = hipThreadIdx_x; + memset(ptr + tx, val, size); } int main() @@ -24,19 +35,29 @@ int main() Val = new uint32_t; *Val = 0; for(int i=0;i Date: Thu, 6 Apr 2017 10:48:11 -0500 Subject: [PATCH 012/171] GGL update, add while 0 guard for hipLaunchKernel API Change-Id: Ie48ef8ca2ab5e26a51febfcd92417902c33fbf66 [ROCm/clr commit: 49a38da6e30c5227f1d9967b7e4fd27809f52ed8] --- .../hip/hcc_detail/grid_launch_GGL.hpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 1d765dfc48..8f1abbb70b 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -841,15 +841,16 @@ namespace hip_impl group_mem_bytes,\ stream,\ ...)\ - {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) + } #endif //GENERIC_GRID_LAUNCH From 950dfd28d6bf00a84577dfa999da7f169af500a3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 6 Apr 2017 16:43:26 -0500 Subject: [PATCH 013/171] fixed header structure for complex data types Change-Id: 
I16bf19005d933f42e8c8603c5d0b2df8ea3ad04f [ROCm/clr commit: 1358bd8f076d5ddb23e7d4a3f43b0211a6391a3b] --- projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h | 3 +-- projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h | 2 +- projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h index dd742e484c..26d73a21a8 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h @@ -23,8 +23,7 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H -#include "./hip_fp16.h" -#include "./hip_vector_types.h" +#include "hip/hcc_detail/hip_vector_types.h" #if __cplusplus #define COMPLEX_ADD_OP_OVERLOAD(type) \ diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index febc1b4fce..0a861b64af 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -23,7 +23,7 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H -#include "hip/hip_runtime.h" +#include "hip/hcc_detail/hip_vector_types.h" #if __clang_major__ > 3 diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 42e1d6663c..82bd3b2d6f 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -32,7 +32,7 @@ THE SOFTWARE. 
#error("This version of HIP requires a newer version of HCC."); #endif -#include "host_defines.h" +#include "hip/hcc_detail/host_defines.h" #define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ From c2acfe24ccbb9a37a5ecbb51b35bc58dae6415f9 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 6 Apr 2017 23:54:00 +0000 Subject: [PATCH 014/171] Improve documentation for hipModuleLaunch functions. Change-Id: I0e22621e499775740c3301347b7416d5f98c2414 [ROCm/clr commit: fb4e2307f1346d642a48d0ff3432b510f3084d2f] --- .../hipamd/include/hip/hcc_detail/hip_hcc.h | 44 ++++++++++++++++++- .../include/hip/hcc_detail/hip_runtime_api.h | 23 +++++----- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h index 645e980376..889e04eb9f 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h @@ -28,6 +28,17 @@ THE SOFTWARE. #if __cplusplus #ifdef __HCC__ #include + + +/** + *------------------------------------------------------------------------------------------------- + *------------------------------------------------------------------------------------------------- + * @defgroup HCC-specific features + * @warning These APIs provide access to special features of HCC compiler and are not available through the CUDA path. + * @{ + */ + + /** * @brief Return hc::accelerator associated with the specified deviceId * @return #hipSuccess, #hipErrorInvalidDevice @@ -45,6 +56,30 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a #endif // #ifdef __HCC__ +/** + * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra + * + * @param [in[ f Kernel to launch. 
+ * @param [in] gridDimX X grid dimension specified in work-items + * @param [in] gridDimY Y grid dimension specified in work-items + * @param [in] gridDimZ Z grid dimension specified in work-items + * @param [in] blockDimX X block dimensions specified in work-items + * @param [in] blockDimY Y grid dimension specified in work-items + * @param [in] blockDimZ Z grid dimension specified in work-items + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] kernelParams + * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. + * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. + * @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API. + * + * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue + + * If startNanos or stopNanos is specified, this API will record and return the start and stop timestamps for the command. The timestamps are collected on the GPU device + * and converted into ns resolution. Typically programs will specify both pointers. Collecting performance timestamps may have a small overhead (approx 1us). + * + * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. 
+ */ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, @@ -55,8 +90,15 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, - void **extra); + void **extra, + uint64_t *startNanos=nullptr, + uint64_t *stopNanos=nullptr + ); +// doxygen end HCC-specific features +/** + * @} + */ #endif // #if __cplusplus #endif // diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 0daca7a53b..f9bfb5a310 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -1913,19 +1913,18 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra * - * @param [in[ f - * @param [in] gridDimX - * @param [in] gridDimY - * @param [in] gridDimZ - * @param [in] blockDimX - * @param [in] blockDimY - * @param [in] blockDimZ - * @param [in] sharedMemBytes - * @param [in] stream - * @param [in] kernelParams - * @param [in] extraa + * @param [in[ f Kernel to launch. + * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. + * @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. + * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. + * @param [in] blockDimX X block dimensions specified in work-items + * @param [in] blockDimY Y grid dimension specified in work-items + * @param [in] blockDimZ Z grid dimension specified in work-items + * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. + * @param [in] stream Stream where the kernel should be dispatched. 
May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] kernelParams + * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * - * The function takes the above arguments and run the kernel in hipFunction_t f. with launch parameters specified in gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY and blockDimmZ. The amount of shared memory is specificed and can be used with HIP_DYNAMIC_SHARED. The arguemt extra is used to pass in the arguments for the kernel. * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue * * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. From 1897ca4a205aaa7a86abca27a28a91f38e8d6deb Mon Sep 17 00:00:00 2001 From: sunway513 Date: Thu, 6 Apr 2017 23:55:15 +0000 Subject: [PATCH 015/171] Refactor events and add initial event option for hipHccModuleLaunchKernel - Change hipEvent_t to a class. - Move event logic inside the class. - Add _type to support Independent, StartCommand, StopCommand events. StartCommand returns start timestamp from events. Change-Id: I4ddd694f2645a3ff7170c9111dc1d3e39931ca21 [ROCm/clr commit: f442e975c6aba2903f1f69de8b58a11149992d31] --- projects/clr/hipamd/src/hip_event.cpp | 60 +++++++++++++++++++--- projects/clr/hipamd/src/hip_hcc.cpp | 17 ------ projects/clr/hipamd/src/hip_hcc_internal.h | 31 ++++++++--- projects/clr/hipamd/src/hip_module.cpp | 32 +++++++++--- 4 files changed, 99 insertions(+), 41 deletions(-) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index d44f201db5..61ac5cd3ab 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -30,6 +30,54 @@ THE SOFTWARE. 
//--- +ihipEvent_t::ihipEvent_t(unsigned flags) +{ + _state = hipEventStatusCreated; + _stream = NULL; + _flags = flags; + _timestamp = 0; + _type = hipEventTypeIndependent; +}; + + + +// Attach to an existing completion future: +void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType) +{ + _state = hipEventStatusRecording; + _marker = *cf; + _type = eventType; +} + + + +void ihipEvent_t::setTimestamp() +{ + if (_state == hipEventStatusRecorded) { + // already recorded, done: + return; + } else { + // TODO - use completion-future functions to obtain ticks and timestamps: + hsa_signal_t *sig = static_cast (_marker.get_native_handle()); + if (sig) { + if (hsa_signal_load_acquire(*sig) == 0) { + + if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { + _timestamp = _marker.get_end_tick(); + } else if (_type == hipEventTypeStartCommand) { + _timestamp = _marker.get_begin_tick(); + } else { + assert(0); // TODO - move to debug assert + _timestamp = 0; + } + + _state = hipEventStatusRecorded; + } + } + } +} + + hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) { hipError_t e = hipSuccess; @@ -37,12 +85,8 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) // TODO-IPC - support hipEventInterprocess. 
unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming; if ((flags & ~supportedFlags) == 0) { - ihipEvent_t *eh = new ihipEvent_t(); + ihipEvent_t *eh = new ihipEvent_t(flags); - eh->_state = hipEventStatusCreated; - eh->_stream = NULL; - eh->_flags = flags; - eh->_timestamp = 0; *event = eh; } else { e = hipErrorInvalidValue; @@ -141,8 +185,8 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) ihipEvent_t *start_eh = start; ihipEvent_t *stop_eh = stop; - ihipSetTs(start); - ihipSetTs(stop); + start->setTimestamp(); + stop->setTimestamp(); hipError_t status = hipSuccess; *ms = 0.0f; @@ -151,7 +195,7 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) { // Common case, we have good information for both events. - int64_t tickDiff = (stop_eh->_timestamp - start_eh->_timestamp); + int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp()); uint64_t freqHz; hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 374840f91f..35a3e11e71 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1641,23 +1641,6 @@ const char *ihipErrorString(hipError_t hip_error) }; -void ihipSetTs(hipEvent_t e) -{ - ihipEvent_t *eh = e; - if (eh->_state == hipEventStatusRecorded) { - // already recorded, done: - return; - } else { - // TODO - use completion-future functions to obtain ticks and timestamps: - hsa_signal_t *sig = static_cast (eh->_marker.get_native_handle()); - if (sig) { - if (hsa_signal_load_acquire(*sig) == 0) { - eh->_timestamp = eh->_marker.get_end_tick(); - eh->_state = hipEventStatusRecorded; - } - } - } -} // Returns true if copyEngineCtx can see the memory allocated on dstCtx and srcCtx. 
diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 4b960e2820..459ea3ba2c 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -584,22 +584,39 @@ private: // Data //---- // Internal event structure: enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. - hipEventStatusCreated = 1, - hipEventStatusRecording = 2, // event has been enqueued to record something. - hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. + hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. + hipEventStatusCreated = 1, + hipEventStatusRecording = 2, // event has been enqueued to record something. + hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. } ; +// TODO - rename to ihip type of some kind +enum ihipEventType_t { + hipEventTypeIndependent, + hipEventTypeStartCommand, + hipEventTypeStopCommand, +}; // internal hip event structure. -struct ihipEvent_t { - hipEventStatus_t _state; +class ihipEvent_t { +public: + ihipEvent_t(unsigned flags); + void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); + void setTimestamp(); + uint64_t timestamp() const { return _timestamp; } ; + +public: + hipEventStatus_t _state; hipStream_t _stream; // Stream where the event is recorded, or NULL if all streams. unsigned _flags; hc::completion_future _marker; + +private: + ihipEventType_t _type; uint64_t _timestamp; // store timestamp, may be set on host or by marker. 
+friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream); } ; @@ -822,8 +839,6 @@ extern hipError_t ihipDeviceSetState(); extern ihipDevice_t *ihipGetDevice(int); ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex); -extern void ihipSetTs(hipEvent_t e); - hipStream_t ihipSyncAndResolveStream(hipStream_t); diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 67bba5f935..c8555672c3 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -364,10 +364,11 @@ hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, hipError_t ihipModuleLaunchKernel(hipFunction_t f, - uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, - uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, - size_t sharedMemBytes, hipStream_t hStream, - void **kernelParams, void **extra) + uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, + uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, + size_t sharedMemBytes, hipStream_t hStream, + void **kernelParams, void **extra, + hipEvent_t *startEvent, hipEvent_t *stopEvent) { auto ctx = ihipGetTlsDefaultCtx(); @@ -446,7 +447,20 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, (HSA_FENCE_SCOPE_SYSTEM << HSA_PACKET_HEADER_RELEASE_FENCE_SCOPE); }; - lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, nullptr/*completion_future*/); + + hc::completion_future cf; + + lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, + (startEvent || stopEvent) ? 
&cf : nullptr); + + + if (startEvent) { + (*startEvent)->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + } + if (stopEvent) { + (*stopEvent)->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + } + if(kernelParams != NULL){ free(config[1]); @@ -470,7 +484,8 @@ hipError_t hipModuleLaunchKernel(hipFunction_t f, return ihipLogStatus(ihipModuleLaunchKernel(f, blockDimX * gridDimX, blockDimY * gridDimY, gridDimZ * blockDimZ, blockDimX, blockDimY, blockDimZ, - sharedMemBytes, hStream, kernelParams, extra)); + sharedMemBytes, hStream, kernelParams, extra, + nullptr, nullptr)); } @@ -478,7 +493,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, - void **kernelParams, void **extra) + void **kernelParams, void **extra, + hipEvent_t *startEvent, hipEvent_t *stopEvent) { HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, @@ -486,7 +502,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, kernelParams, extra); return ihipLogStatus(ihipModuleLaunchKernel(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, - sharedMemBytes, hStream, kernelParams, extra)); + sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, From dc9e957a360498777d280886d1fff55265d6a074 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 6 Apr 2017 21:00:00 -0500 Subject: [PATCH 016/171] Finish adding start/stop event support to hipHccModuleLaunchKernel. 
Change interface to use hipEvent_t rather than hipEvent_t* Change-Id: I259062dc087a13d51dc27f84e1e8861f332a104d [ROCm/clr commit: e9eaadd135f57569f3c2ff2186434bb340655f35] --- projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h | 11 +++++------ projects/clr/hipamd/src/hip_hcc_internal.h | 1 + projects/clr/hipamd/src/hip_module.cpp | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h index 889e04eb9f..fc04917931 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_hcc.h @@ -70,15 +70,14 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. - * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. + * @param [in] startEvent If non-null, specified event will be updated to track the start time of the kernel launch. The event must be created before calling this API. * @param [in] stopEvent If non-null, specified event will be updated to track the stop time of the kernel launch. The event must be created before calling this API. * * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue - - * If startNanos or stopNanos is specified, this API will record and return the start and stop timestamps for the command. The timestamps are collected on the GPU device - * and converted into ns resolution. Typically programs will specify both pointers. 
Collecting performance timestamps may have a small overhead (approx 1us). * * @warning kernellParams argument is not yet implemented in HIP. Please use extra instead. Please refer to hip_porting_driver_api.md for sample usage. + + * HIP/ROCm actually updates the start event when the associated kernel completes. */ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t globalWorkSizeX, @@ -91,8 +90,8 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, hipStream_t hStream, void **kernelParams, void **extra, - uint64_t *startNanos=nullptr, - uint64_t *stopNanos=nullptr + hipEvent_t startEvent=nullptr, + hipEvent_t stopEvent=nullptr ); // doxygen end HCC-specific features diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 459ea3ba2c..9c17c6e98c 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -604,6 +604,7 @@ public: void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); void setTimestamp(); uint64_t timestamp() const { return _timestamp; } ; + ihipEventType_t type() const { return _type; }; public: hipEventStatus_t _state; diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index c8555672c3..b359e7a63c 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -368,7 +368,7 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, - hipEvent_t *startEvent, hipEvent_t *stopEvent) + hipEvent_t startEvent, hipEvent_t stopEvent) { auto ctx = ihipGetTlsDefaultCtx(); @@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if (startEvent) { - (*startEvent)->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + startEvent->attachToCompletionFuture(&cf, 
hipEventTypeStartCommand); } if (stopEvent) { - (*stopEvent)->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand); } @@ -494,7 +494,7 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void **kernelParams, void **extra, - hipEvent_t *startEvent, hipEvent_t *stopEvent) + hipEvent_t startEvent, hipEvent_t stopEvent) { HIP_INIT_API(f, globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ, localWorkSizeX, localWorkSizeY, localWorkSizeZ, From cf6ccf67dfac006ddf1d78eb722e8f9f056cfdc0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 14:51:54 +0530 Subject: [PATCH 017/171] Fix build issues in hipCommander sample - Remove -stdlib=libc++ from Makefile - Removed deleted HIP header file from includes Change-Id: Ia189396bee19fc52b679259df56c6c6e2bafb6fe [ROCm/clr commit: 6877554f5e0011adfe9e37d10a0b5a5958928643] --- projects/clr/hipamd/samples/1_Utils/hipCommander/Makefile | 3 --- .../clr/hipamd/samples/1_Utils/hipCommander/hipCommander.cpp | 1 - 2 files changed, 4 deletions(-) diff --git a/projects/clr/hipamd/samples/1_Utils/hipCommander/Makefile b/projects/clr/hipamd/samples/1_Utils/hipCommander/Makefile index e770c636a4..a411763b7f 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipCommander/Makefile +++ b/projects/clr/hipamd/samples/1_Utils/hipCommander/Makefile @@ -10,9 +10,6 @@ OPT=-O3 CXXFLAGS = $(OPT) --std=c++11 HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --platform) -ifeq (${HIP_PLATFORM}, hcc) - CXXFLAGS += " -stdlib=libc++" -endif CODE_OBJECTS=nullkernel.hsaco diff --git a/projects/clr/hipamd/samples/1_Utils/hipCommander/hipCommander.cpp b/projects/clr/hipamd/samples/1_Utils/hipCommander/hipCommander.cpp index 0add1ce3e3..4b93180b18 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipCommander/hipCommander.cpp +++ 
b/projects/clr/hipamd/samples/1_Utils/hipCommander/hipCommander.cpp @@ -11,7 +11,6 @@ #include #include #include -#include #endif #include From 3176273cd62c199d105a89ee2ffc5c24b02170b4 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:24:10 +0530 Subject: [PATCH 018/171] Fix build issues with bit_extract sample Change-Id: I628b3c83a16f7adf0ab8ca60aecde8c073c34fd9 [ROCm/clr commit: cbb7f12b1de18c1d291e4086afcbce66e395b35b] --- projects/clr/hipamd/samples/0_Intro/bit_extract/Makefile | 4 ---- 1 file changed, 4 deletions(-) diff --git a/projects/clr/hipamd/samples/0_Intro/bit_extract/Makefile b/projects/clr/hipamd/samples/0_Intro/bit_extract/Makefile index 78f6a2faa8..08bca6e642 100644 --- a/projects/clr/hipamd/samples/0_Intro/bit_extract/Makefile +++ b/projects/clr/hipamd/samples/0_Intro/bit_extract/Makefile @@ -11,10 +11,6 @@ HIPCC=$(HIP_PATH)/bin/hipcc ifeq (${HIP_PLATFORM}, nvcc) HIPCC_FLAGS = -gencode=arch=compute_20,code=sm_20 endif -ifeq (${HIP_PLATFORM}, hcc) - HIPCC_FLAGS = -stdlib=libc++ -endif - EXE=bit_extract From 4dcbf609b77bea6d53a3b2f020017285c4523aaf Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Fri, 7 Apr 2017 15:38:56 +0530 Subject: [PATCH 019/171] Updated table of contents in markdown documentation Change-Id: I7347a06f57f9927ca3fcc5590a6c8200bc1bb1f5 [ROCm/clr commit: cfd4620f3671ca3221f31877e93ec71a45c19ce2] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 6 +-- projects/clr/hipamd/docs/markdown/hip_faq.md | 9 ++-- .../docs/markdown/hip_kernel_language.md | 1 + .../hipamd/docs/markdown/hip_porting_guide.md | 9 ++-- .../clr/hipamd/docs/markdown/hip_profiling.md | 46 +++++++++++-------- 5 files changed, 41 insertions(+), 30 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index 14f2935f17..73133843bc 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -1,10 +1,10 @@ -# HIP Bugs +# HIP Bugs 
-- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**](#errors-related-to-undefined-reference-to-hclaunchkernel__grid_launch_parm) -- [Application hangs after a hipLaunchKernel call](#what-if-i-see-application-hangs-after-a-hiplaunchkernel-call) +- [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) +- [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor) - [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) diff --git a/projects/clr/hipamd/docs/markdown/hip_faq.md b/projects/clr/hipamd/docs/markdown/hip_faq.md index 8ccb458103..e316d449ef 100644 --- a/projects/clr/hipamd/docs/markdown/hip_faq.md +++ b/projects/clr/hipamd/docs/markdown/hip_faq.md @@ -4,7 +4,7 @@ - [What APIs and features does HIP support?](#what-apis-and-features-does-hip-support) - [What is not supported?](#what-is-not-supported) - * [Run-time features](#run-time-features) + * [Runtime/Driver API features](#runtimedriver-api-features) * [Kernel language features](#kernel-language-features) - [Is HIP a drop-in replacement for CUDA?](#is-hip-a-drop-in-replacement-for-cuda) - [What specific version of CUDA does HIP support?](#what-specific-version-of-cuda-does-hip-support) @@ -23,10 +23,11 @@ - [On HCC, can I link HIP code with host code compiled with another compiler such as gcc, icc, or clang ?](#on-hcc-can-i-link-hip-code-with-host-code-compiled-with-another-compiler-such-as-gcc-icc-or-clang-) - [HIP detected my platform (hcc vs nvcc) incorrectly - what should I do?](#hip-detected-my-platform-hcc-vs-nvcc-incorrectly---what-should-i-do) - [Can I install both CUDA SDK and HCC on same 
machine?](#can-i-install-both-cuda-sdk-and-hcc-on-same-machine) +- [On CUDA, can I mix CUDA code with HIP code?](#on-cuda-can-i-mix-cuda-code-with-hip-code) +- [On HCC, can I use HC functionality with HIP?](#on-hcc-can-i-use-hc-functionality-with-hip) - [How do I trace HIP application flow?](#how-do-i-trace-hip-application-flow) - * [Using CodeXL markers for HIP Functions](#using-codexl-markers-for-hip-functions) - * [Using HIP_TRACE_API](#using-hip_trace_api) -- [How do I enable HIP Generic Grid Launch option?](#how-do-i-enable-hip-generic-grid-launch-option) +- [What if HIP generates error of "symbol multiply defined!" only on AMD machine?](#what-if-hip-generates-error-of-symbol-multiply-defined-only-on-amd-machine) +- [How do I disable HIP Generic Grid Launch option?](#how-do-i-disable-hip-generic-grid-launch-option) diff --git a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md index 0c7f3c8d25..3cb7b17a0c 100644 --- a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md +++ b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md @@ -44,6 +44,7 @@ - [Pragma Unroll](#pragma-unroll) - [In-Line Assembly](#in-line-assembly) - [C++ Support](#c-support) +- [Kernel Compilation](#kernel-compilation) diff --git a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md index 9f20d12423..72f6384f6d 100644 --- a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md +++ b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md @@ -21,6 +21,7 @@ and provides practical suggestions on how to port CUDA code and work through com * [Device-Architecture Properties](#device-architecture-properties) * [Table of Architecture Properties](#table-of-architecture-properties) - [Finding HIP](#finding-hip) +- [hipLaunchKernel](#hiplaunchkernel) - [Compiler Options](#compiler-options) - [Linking Issues](#linking-issues) * [Linking With hipcc](#linking-with-hipcc) 
@@ -31,9 +32,11 @@ and provides practical suggestions on how to port CUDA code and work through com * [Using a Standard C++ Compiler](#using-a-standard-c-compiler) + [cuda.h](#cudah) * [Choosing HIP File Extensions](#choosing-hip-file-extensions) - * [Workarounds](#workarounds) - + [warpSize](#warpsize) - + [Textures and Cache Control](#textures-and-cache-control) +- [Workarounds](#workarounds) + * [warpSize](#warpsize) +- [memcpyToSymbol](#memcpytosymbol) +- [threadfence_system](#threadfence_system) + * [Textures and Cache Control](#textures-and-cache-control) - [More Tips](#more-tips) * [HIPTRACE Mode](#hiptrace-mode) * [Environment Variables](#environment-variables) diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index 463c9c13b3..6e5cde700d 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -4,26 +4,32 @@ This section describes the profiling and debugging capabilities that HIP provide Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs. This document starts with some of the general capabilities of CodeXL and then describes some of the additional HIP marker and debug features. 
- * [CodeXL Profiling](#codexl-profiling) - * [Collecting and Viewing Traces](#collecting-and-viewing-traces) - * [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling) - * [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection) - * [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results) - * [More information on CodeXL](#more-information-on-codexl) - * [HIP Markers](#hip-markers) - * [Profiling HIP APIs](#profiling-hip-apis) - * [Adding markers to applications](#adding-markers-to-applications) - * [Additional HIP Profiling Features](#additional-hip-profiling-features) - * [Demangling C Kernel Names](#demangling-c-kernel-names) - * [Controlling when profiling starts and ends](#controlling-when-profiling-starts-and-ends) - * [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size) - * [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time) - * [Tracing and Debug](#tracing-and-debug) - * [Tracing HIP APIs](#tracing-hip-apis) - * [Color](#color) - * [Using HIP_DB](#using-hip_db) - * [Using ltrace](#using-ltrace) - * [Chicken bits](#chicken-bits) + + +- [CodeXL Profiling](#codexl-profiling) + * [Collecting and Viewing Traces](#collecting-and-viewing-traces) + + [Using rocm-profiler timestamp profiling](#using-rocm-profiler-timestamp-profiling) + + [Using rocm-profiler performance counter collection:](#using-rocm-profiler-performance-counter-collection) + + [Using CodeXL to view profiling results:](#using-codexl-to-view-profiling-results) + + [More information on CodeXL](#more-information-on-codexl) + * [HIP Markers](#hip-markers) + + [Profiling HIP APIs](#profiling-hip-apis) + + [Adding markers to applications](#adding-markers-to-applications) + * [Additional HIP Profiling Features](#additional-hip-profiling-features) + + [Demangling C++ Kernel Names](#demangling-c-kernel-names) + + [Controlling when 
profiling starts and ends](#controlling-when-profiling-starts-and-ends) + + [Reducing timeline trace output file size](#reducing-timeline-trace-output-file-size) + + [How to enable profiling at HIP build time](#how-to-enable-profiling-at-hip-build-time) +- [Tracing and Debug](#tracing-and-debug) + * [Tracing HIP APIs](#tracing-hip-apis) + + [Color](#color) + * [Using HIP_DB](#using-hip_db) + * [Using ltrace](#using-ltrace) + * [Chicken bits](#chicken-bits) + * [Debugging HIP Applications](#debugging-hip-applications) + * [General Debugging Tips](#general-debugging-tips) + + ## CodeXL Profiling From 9fd6c7179d55bbf55cecb8f69f9b27a294974382 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 14:06:31 -0500 Subject: [PATCH 020/171] update GGL to log launched kernel information Change-Id: Ied0aa6055673c687071b4a579aecd17f0f3f09ce [ROCm/clr commit: f699e027150265e8108bc9135f9a1e1d81183fa1] --- .../hip/hcc_detail/grid_launch_GGL.hpp | 33 ++-- .../hipamd/include/hip/hcc_detail/helpers.hpp | 150 ++++++++---------- projects/clr/hipamd/src/grid_launch.cpp | 36 +++++ 3 files changed, 126 insertions(+), 93 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 8f1abbb70b..2dd9a95bc6 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #pragma once + #if GENERIC_GRID_LAUNCH == 1 #include "concepts.hpp" @@ -71,7 +72,7 @@ namespace hip_impl template using is_new_grid_launch_t = typename std::conditional< - std::is_callable{}, + is_callable{}, New_grid_launch_tag, Old_grid_launch_tag>::type; } @@ -118,6 +119,7 @@ namespace hip_impl // TODO: these are workarounds, they should be removed. 
hc::accelerator_view lock_stream_hip_(hipStream_t&, void*&); + void print_prelaunch_trace_(const char*, dim3, dim3, int, hipStream_t); void unlock_stream_hip_( hipStream_t, void*, const char*, hc::accelerator_view*); @@ -137,7 +139,13 @@ namespace hip_impl void* lck_stream = nullptr; auto acc_v = lock_stream_hip_(stream, lck_stream); auto stream_guard = make_RAII_guard( - [](){ /* perhaps use a slimmed down ihipPrintKernelLaunch here */ }, + std::bind( + print_prelaunch_trace_, + kernel_name, + num_blocks, + dim_blocks, + group_mem_bytes, + stream), std::bind( unlock_stream_hip_, stream, lck_stream, kernel_name, &acc_v)); @@ -841,16 +849,15 @@ namespace hip_impl group_mem_bytes,\ stream,\ ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) - + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) } #endif //GENERIC_GRID_LAUNCH diff --git a/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp b/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp index e5a84a4678..611929766b 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp @@ -21,6 +21,7 @@ THE SOFTWARE. */ #pragma once +#include "concepts.hpp" #include // For std::conditional, std::decay, std::enable_if, // std::false_type, std result_of and std::true_type. @@ -29,9 +30,6 @@ THE SOFTWARE. namespace std { // TODO: these should be removed as soon as possible. 
#if (__cplusplus < 201406L) - template - using void_t = void; - #if (__cplusplus < 201402L) template using enable_if_t = typename enable_if::type; @@ -43,88 +41,80 @@ namespace std using result_of_t = typename result_of::type; template using remove_reference_t = typename remove_reference::type; - template< - FunctionalProcedure F, - unsigned int n = 0u, - typename = void> - struct is_callable_impl : is_callable_impl {}; - - // Pointer to member function, call through non-pointer. - template - struct is_callable_impl< - F(C, Ts...), - 0u, - void_t().*declval())(declval()...))> - > : true_type { - }; - - // Pointer to member function, call through pointer. - template - struct is_callable_impl< - F(C, Ts...), - 1u, - void_t()).*declval())(declval()...))> - > : std::true_type { - }; - - // Pointer to member data, call through non-pointer, no args. - template - struct is_callable_impl< - F(C), - 2u, - void_t().*declval())> - > : true_type { - }; - - // Pointer to member data, call through pointer, no args. - template - struct is_callable_impl< - F(C), - 3u, - void_t().*declval())> - > : true_type { - }; - - // General call, n args. - template - struct is_callable_impl< - F(Ts...), - 4u, - void_t()(declval()...))> - > : true_type { - }; - - // Not callable. - template - struct is_callable_impl : false_type {}; - - template - struct is_callable : is_callable_impl {}; - #else - template - struct is_callable_impl : false_type {}; - - template - struct is_callable_impl< - F(Ts...), - void_t>> : true_type {}; - - template - struct is_callable : is_callable_impl {}; #endif - template - struct disjunction : false_type {}; - template - struct disjunction : B1 {}; - template - struct disjunction - : conditional_t> - {}; #endif } -namespace hip_impl // Only for documentation, macros ignore namespaces. 
+namespace hip_impl { + template + using void_t_ = void; + + #if (__cplusplus < 201402L) + template< + FunctionalProcedure F, + unsigned int n = 0u, + typename = void> + struct is_callable_impl : is_callable_impl {}; + + // Pointer to member function, call through non-pointer. + template + struct is_callable_impl< + F(C, Ts...), + 0u, + void_t_().*std::declval())( + std::declval()...))> + > : std::true_type {}; + + // Pointer to member function, call through pointer. + template + struct is_callable_impl< + F(C, Ts...), + 1u, + void_t_()).*std::declval())( + std::declval()...))> + > : std::true_type {}; + + // Pointer to member data, call through non-pointer, no args. + template + struct is_callable_impl< + F(C), + 2u, + void_t_().*std::declval())> + > : std::true_type {}; + + // Pointer to member data, call through pointer, no args. + template + struct is_callable_impl< + F(C), + 3u, + void_t_().*std::declval())> + > : std::true_type {}; + + // General call, n args. + template + struct is_callable_impl< + F(Ts...), + 4u, + void_t_()(std::declval()...))> + > : std::true_type {}; + + // Not callable. + template + struct is_callable_impl : std::false_type {}; + + template + struct is_callable : is_callable_impl {}; + #else + template + struct is_callable_impl : std::false_type {}; + + template + struct is_callable_impl< + F(Ts...), + void_t_>> : std::true_type {}; + #endif + #define count_macro_args_impl_hip_(\ _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\ _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29,\ diff --git a/projects/clr/hipamd/src/grid_launch.cpp b/projects/clr/hipamd/src/grid_launch.cpp index 7739995600..cac01df7dc 100644 --- a/projects/clr/hipamd/src/grid_launch.cpp +++ b/projects/clr/hipamd/src/grid_launch.cpp @@ -27,6 +27,9 @@ THE SOFTWARE. 
#include "hc.hpp" #include "trace_helper.h" +#include +#include + namespace hip_impl { hc::accelerator_view lock_stream_hip_( @@ -42,6 +45,39 @@ namespace hip_impl return (*static_cast(locked_stream))->_av; } + void print_prelaunch_trace_( + const char* kernel_name, + dim3 num_blocks, + dim3 dim_blocks, + int group_mem_bytes, + hipStream_t stream) + { + if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + HIP_PROFILE_API || + (COMPILE_HIP_DB && HIP_TRACE_API)) { + std::stringstream os; + os << tls_tidInfo.tid() << "." << tls_tidInfo.apiSeqNum() + << " hipLaunchKernel '" << kernel_name << "'" + << " gridDim:" << num_blocks + << " groupDim:" << dim_blocks + << " sharedMem:+" << group_mem_bytes + << " " << *stream; + + if (HIP_PROFILE_API == 0x1) { + std::string shortAtpString("hipLaunchKernel:"); + shortAtpString += kernel_name; + MARKER_BEGIN(shortAtpString.c_str(), "HIP"); + } else if (HIP_PROFILE_API == 0x2) { + MARKER_BEGIN(os.str().c_str(), "HIP"); + } + + if (COMPILE_HIP_DB && HIP_TRACE_API) { + std::cerr << API_COLOR << os.str() << API_COLOR_END + << std::endl; + } + } + } + void unlock_stream_hip_( hipStream_t stream, void* locked_stream, From 7cedda499643486b21e19d937f32d4ebe9726b4c Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 14:17:41 -0500 Subject: [PATCH 021/171] Add more operator overloading for float2 type, contributed by Aditya Change-Id: If1ab7fb24d64bb5304142aed0951c9bd5ad47d20 [ROCm/clr commit: 19987ede228add81501bbe346a133950e5b3edcd] --- .../include/hip/hcc_detail/hip_vector_types.h | 38 ++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 82bd3b2d6f..35c6c23548 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -1270,6 +1270,15 @@ __device__ __host__ static inline type operator op 
(type& val, int) { \ #define DECLOP_1VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return lhs.x op rhs.x; \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return lhs.x op rhs.x ; \ } #define DECLOP_1VAR_1IN_1OUT(type, op) \ @@ -1338,6 +1347,15 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_2VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ } #define DECLOP_2VAR_1IN_1OUT(type, op) \ @@ -1415,7 +1433,16 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_3VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op 
rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ +} \ #define DECLOP_3VAR_1IN_1OUT(type, op) \ __device__ __host__ static inline type operator op(type &rhs) { \ @@ -1500,6 +1527,15 @@ __device__ __host__ static inline type operator op (type& val, int) { \ #define DECLOP_4VAR_COMP(type, op) \ __device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ +} \ +__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ + return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ } #define DECLOP_4VAR_1IN_1OUT(type, op) \ From 4cd6fb07d58f6895cf6d09106e43056ed6489330 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Fri, 7 Apr 2017 16:29:25 -0500 Subject: [PATCH 022/171] Update the define of __global__ for GGL Change-Id: I563bb2a132403bcbe9e9f279b55406cf0255af7d [ROCm/clr commit: 9884f2e83f241cbe45646006ffa264bda06609a0] --- projects/clr/hipamd/include/hip/hcc_detail/host_defines.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h index b0a7421d18..5864cfa0e7 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h @@ -48,7 +48,7 @@ THE SOFTWARE. 
#define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else //#warning "GGL global define reached" -#define __global__ [[hc]] __attribute__((weak)) +#define __global__ __attribute__((hc, weak)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From 588dba559c502f59df8f9f64af0c41526bceac19 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 10 Apr 2017 08:53:12 -0500 Subject: [PATCH 023/171] add math.h to cover sqrtf function Change-Id: Ia37752710cea4ca77e0a4e61f8e69a0355d9488d [ROCm/clr commit: bfa61eac2c2db89eb60627afcc539a205a493efa] --- projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h index 26d73a21a8..c76d65b058 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_complex.h @@ -24,6 +24,7 @@ THE SOFTWARE. #define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_COMPLEX_H #include "hip/hcc_detail/hip_vector_types.h" +#include #if __cplusplus #define COMPLEX_ADD_OP_OVERLOAD(type) \ From a5e8222af848ba1caeb7704ba9e8998f6936be80 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 10 Apr 2017 11:17:05 -0500 Subject: [PATCH 024/171] Fix ifndef guard in hip_fp16.h Change-Id: I0215556e7aa98a74e8a984e4de3fb6e8cafdfb24 [ROCm/clr commit: 3c04722d10ee7efe0545afbcc24f3b122f78b95b] --- projects/clr/hipamd/include/hip/hip_fp16.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/hip_fp16.h b/projects/clr/hipamd/include/hip/hip_fp16.h index 0e002d9396..95879dba50 100644 --- a/projects/clr/hipamd/include/hip/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hip_fp16.h @@ -20,7 +20,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ -#ifdef HIP_INCLUDE_HIP_HIP_FP16_H +#ifndef HIP_INCLUDE_HIP_HIP_FP16_H #define HIP_INCLUDE_HIP_HIP_FP16_H #include From 0d24c155a3792fbed6799908e40b1543be2d90d9 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 9 Apr 2017 20:51:56 -0500 Subject: [PATCH 025/171] Doc update for Serialization. Describe workaround for partial specialization [ROCm/clr commit: 29af40fa76c44b6cba83db7ff12d2288bf51e499] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 81 ++++++++++++------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index 73133843bc..9452fae2fd 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -1,5 +1,4 @@ -# HIP Bugs - +# HIP Bugs - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) @@ -41,60 +40,86 @@ For example, `Foo` in the code snippets below contains an array-typed member var ``` struct Foo { + float _data; // table is an array, which makes foo int table[3]; }; ``` -An workaround is to provide a custom serializer on CPU side, and append the contents of the array as kernel arguments: +A workaround is to provide a custom serializer on the host side which appends the contents of the array as kernel arguments, and a custom deserializer on the device path to reconstruct the array inside the GPU kernels. +The deserializer cannot be a function template, and should have a number of scalar-typed parameters equal to the length of the array-typed member variable.
For example: ``` struct Foo { - int table[3]; + float _data; + int _table[3]; + - // user-provided CPU serializer - // must append the contents of the array member as kernel arguments #ifdef __HCC__ + // user-provided CPU serializer + // Append the contents of the array member as kernel arguments __attribute__((annotate(“serialize”))) void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(float), &_data); for (int i = 0; i < 3; ++i) - s.Append(sizeof(int), &table[i]); + s.Append(sizeof(int), &_table[i]); } -#endif -}; -``` -Then, provide a custom deserializer on GPU side, to help reconstruct the array within GPU kernels. Notice that the deserializer can not be a function template, and should have scalar-typed parameters of the number equals to the length of the array-typed member variable. For example: - -``` -struct Foo { - int table[3]; // user-provided GPU deserializer // table has 3 int elements, so deserializer must have 3 int parameters. -#ifdef __HCC__ __attribute__((annotate(“user_deserialize”))) - Foo(int x0, int x1, int x2) [[cpu]][[hc]] { - table[0] = x0; - table[1] = x1; - table[2] = x2; + Foo(float d, int x0, int x1, int x2) [[cpu]][[hc]] { + _data = d; + _table[0] = x0; + _table[1] = x1; + _table[2] = x2; } -#endif -#ifdef __HCC__ - __attribute__((annotate(“serialize”))) - void __cxxamp_serialize(Kalmar::Serialize &s) const { - s.Append(sizeof(int), &table[0]); - s.Append(sizeof(int), &table[1]); - s.Append(sizeof(int), &table[2]); - } #endif }; ``` Rather than create serializer functions, another workaround is to pass the member fields from the structure as simple data types. +Note a class or struct can contain only one "user_deserialize" constructor. +For types which contain arrays which are based on template parameter, you can use partial template instantiation to implement one constructor per specialization. +However, an easier approach may be to create one user_deserializer which processes the maximum supported dimension. 
+This will take more memory in the structure and also require additional kernel arguments, but this may have little performance impact and the conversion is easier than partial template specialization. An example: + +``` +#define MAX_Dim 4 +template <typename T, int Dim> struct MyArray { + + T* dataPtr_; + //int size_[Dim]; // Original code with template-sized Dims + int size_[MAX_Dim]; // Workaround code - allocate an array big enough for all dims so one serializer works. + + +... + +#ifdef __HCC__ + __attribute__((annotate("serialize"))) + void __cxxamp_serialize(Kalmar::Serialize &s) const { + s.Append(sizeof(float), &dataPtr_); + for (int i=0; i Date: Tue, 11 Apr 2017 01:16:28 +0000 Subject: [PATCH 026/171] Add integer abs (initial implementation, can be optimized with OCML) Change-Id: I1f568c8c0e2333af1fda4c313dc48ea0c5b6ab00 [ROCm/clr commit: 8bd34535b4422c36c40bc8f51e54c1e7a20805ab] --- projects/clr/hipamd/include/hip/hcc_detail/math_functions.h | 1 + projects/clr/hipamd/src/math_functions.cpp | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h index c3b8186fd3..9faff2743a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/math_functions.h @@ -51,6 +51,7 @@ __device__ float exp10f(float x); __device__ float exp2f(float x); __device__ float expf(float x); __device__ float expm1f(float x); +__device__ int abs(int x); __device__ float fabsf(float x); __device__ float fdimf(float x, float y); __device__ float fdividef(float x, float y); diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp index 92cc8689fc..3472216309 100644 --- a/projects/clr/hipamd/src/math_functions.cpp +++ b/projects/clr/hipamd/src/math_functions.cpp @@ -114,6 +114,10 @@ __device__ float expm1f(float x) { return hc::precise_math::expm1f(x); } +__device__ int abs(int x) +{ +
return x >= 0 ? x : -x; // TODO - optimize with OCML +} __device__ float fabsf(float x) { return hc::precise_math::fabsf(x); From 4ff62e9760eaa487fbf26bd6763d5c8f9d1169f6 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 10:34:33 +0530 Subject: [PATCH 027/171] Add hip-config.cmake to hip_hcc package Due to the way hip packages are generated, for the purpose of packaging hip-targets*.cmake are not generated at build time. However hip-config*.cmake are generated at build time. This will be fixed in future. Change-Id: I5d79bc58a4f7a324ae06457130d8372ffe403830 [ROCm/clr commit: 0d4f1c2d0c81aa997cca265f4955e501670ae32c] --- .../packaging/hip-targets-release.cmake | 41 +++++++ .../clr/hipamd/packaging/hip-targets.cmake | 102 ++++++++++++++++++ projects/clr/hipamd/packaging/hip_hcc.txt | 2 + 3 files changed, 145 insertions(+) create mode 100644 projects/clr/hipamd/packaging/hip-targets-release.cmake create mode 100644 projects/clr/hipamd/packaging/hip-targets.cmake diff --git a/projects/clr/hipamd/packaging/hip-targets-release.cmake b/projects/clr/hipamd/packaging/hip-targets-release.cmake new file mode 100644 index 0000000000..ba0a5005f5 --- /dev/null +++ b/projects/clr/hipamd/packaging/hip-targets-release.cmake @@ -0,0 +1,41 @@ +#---------------------------------------------------------------- +# Generated CMake target import file for configuration "Release". +#---------------------------------------------------------------- + +# Commands may need to know the format version. 
+set(CMAKE_IMPORT_FILE_VERSION 1) + +# Import target "hip::hip_hcc_static" for configuration "Release" +set_property(TARGET hip::hip_hcc_static APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_hcc_static PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LINK_INTERFACE_LIBRARIES_RELEASE "hc_am" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_hcc_static.a" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_hcc_static ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_hcc_static "/opt/rocm/hip/lib/libhip_hcc_static.a" ) + +# Import target "hip::hip_hcc" for configuration "Release" +set_property(TARGET hip::hip_hcc APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_hcc PROPERTIES + IMPORTED_LINK_INTERFACE_LIBRARIES_RELEASE "hcc::hccrt;hcc::hc_am" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_hcc.so" + IMPORTED_SONAME_RELEASE "libhip_hcc.so" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_hcc ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_hcc "/opt/rocm/hip/lib/libhip_hcc.so" ) + +# Import target "hip::hip_device" for configuration "Release" +set_property(TARGET hip::hip_device APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) +set_target_properties(hip::hip_device PROPERTIES + IMPORTED_LINK_INTERFACE_LANGUAGES_RELEASE "CXX" + IMPORTED_LOCATION_RELEASE "/opt/rocm/hip/lib/libhip_device.a" + ) + +list(APPEND _IMPORT_CHECK_TARGETS hip::hip_device ) +list(APPEND _IMPORT_CHECK_FILES_FOR_hip::hip_device "/opt/rocm/hip/lib/libhip_device.a" ) + +# Commands beyond this point should not need to know the version. 
+set(CMAKE_IMPORT_FILE_VERSION) diff --git a/projects/clr/hipamd/packaging/hip-targets.cmake b/projects/clr/hipamd/packaging/hip-targets.cmake new file mode 100644 index 0000000000..65370eec9e --- /dev/null +++ b/projects/clr/hipamd/packaging/hip-targets.cmake @@ -0,0 +1,102 @@ +# Generated by CMake 3.5.1 + +if("${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION}" LESS 2.5) + message(FATAL_ERROR "CMake >= 2.6.0 required") +endif() +cmake_policy(PUSH) +cmake_policy(VERSION 2.6) +#---------------------------------------------------------------- +# Generated CMake target import file. +#---------------------------------------------------------------- + +# Commands may need to know the format version. +set(CMAKE_IMPORT_FILE_VERSION 1) + +# Protect against multiple inclusion, which would fail when already imported targets are added once more. +set(_targetsDefined) +set(_targetsNotDefined) +set(_expectedTargets) +foreach(_expectedTarget hip::hip_hcc_static hip::hip_hcc hip::hip_device) + list(APPEND _expectedTargets ${_expectedTarget}) + if(NOT TARGET ${_expectedTarget}) + list(APPEND _targetsNotDefined ${_expectedTarget}) + endif() + if(TARGET ${_expectedTarget}) + list(APPEND _targetsDefined ${_expectedTarget}) + endif() +endforeach() +if("${_targetsDefined}" STREQUAL "${_expectedTargets}") + set(CMAKE_IMPORT_FILE_VERSION) + cmake_policy(POP) + return() +endif() +if(NOT "${_targetsDefined}" STREQUAL "") + message(FATAL_ERROR "Some (but not all) targets in this export set were already defined.\nTargets Defined: ${_targetsDefined}\nTargets not yet defined: ${_targetsNotDefined}\n") +endif() +unset(_targetsDefined) +unset(_targetsNotDefined) +unset(_expectedTargets) + + +# The installation prefix configured by this project. 
+set(_IMPORT_PREFIX "/opt/rocm/hip") + +# Create imported target hip::hip_hcc_static +add_library(hip::hip_hcc_static STATIC IMPORTED) + +set_target_properties(hip::hip_hcc_static PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Create imported target hip::hip_hcc +add_library(hip::hip_hcc SHARED IMPORTED) + +set_target_properties(hip::hip_hcc PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Create imported target hip::hip_device +add_library(hip::hip_device STATIC IMPORTED) + +set_target_properties(hip::hip_device PROPERTIES + INTERFACE_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" + INTERFACE_SYSTEM_INCLUDE_DIRECTORIES "${_IMPORT_PREFIX}/include;/opt/rocm/hsa/include" +) + +# Load information for each installed configuration. +get_filename_component(_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) +file(GLOB CONFIG_FILES "${_DIR}/hip-targets-*.cmake") +foreach(f ${CONFIG_FILES}) + include(${f}) +endforeach() + +# Cleanup temporary variables. +set(_IMPORT_PREFIX) + +# Loop over all imported files and verify that they actually exist +foreach(target ${_IMPORT_CHECK_TARGETS} ) + foreach(file ${_IMPORT_CHECK_FILES_FOR_${target}} ) + if(NOT EXISTS "${file}" ) + message(FATAL_ERROR "The imported target \"${target}\" references the file + \"${file}\" +but this file does not exist. Possible reasons include: +* The file was deleted, renamed, or moved to another location. +* An install or uninstall procedure did not complete successfully. +* The installation package was faulty and contained + \"${CMAKE_CURRENT_LIST_FILE}\" +but not all the files it references. 
+") + endif() + endforeach() + unset(_IMPORT_CHECK_FILES_FOR_${target}) +endforeach() +unset(_IMPORT_CHECK_TARGETS) + +# This file does not depend on other imported targets which have +# been exported from the same project but in a separate export set. + +# Commands beyond this point should not need to know the version. +set(CMAKE_IMPORT_FILE_VERSION) +cmake_policy(POP) diff --git a/projects/clr/hipamd/packaging/hip_hcc.txt b/projects/clr/hipamd/packaging/hip_hcc.txt index 7dd65033fd..7118c32eb9 100644 --- a/projects/clr/hipamd/packaging/hip_hcc.txt +++ b/projects/clr/hipamd/packaging/hip_hcc.txt @@ -6,6 +6,8 @@ install(FILES @PROJECT_BINARY_DIR@/libhip_hcc_static.a DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/libhip_device.a DESTINATION lib) install(FILES @PROJECT_BINARY_DIR@/.hipInfo DESTINATION lib) install(FILES @hip_SOURCE_DIR@/src/hip_hc.ll @hip_SOURCE_DIR@/src/hip_hc_gfx803.ll DESTINATION lib) +install(FILES @PROJECT_BINARY_DIR@/hip-config.cmake @PROJECT_BINARY_DIR@/hip-config-version.cmake DESTINATION lib/cmake/hip) +install(FILES @hip_SOURCE_DIR@/packaging/hip-targets.cmake @hip_SOURCE_DIR@/packaging/hip-targets-release.cmake DESTINATION lib/cmake/hip) ############################# # Packaging steps From 3e7f31f5701d21b7de8f95a93c730432be1a95d1 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 12:12:04 +0530 Subject: [PATCH 028/171] dtests should ignore HIP_PATH env var Change-Id: I27b1cdab6e6b799987dad3ce97b56c764b1b8867 [ROCm/clr commit: 6db8c7c69e43cb374f60a082e1b55811a1cfd3d4] --- projects/clr/hipamd/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index 94ed2a7562..b4e80625d9 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -372,14 +372,14 @@ endif() # Testing steps ############################# # Target: test -set(HIP_PATH ${CMAKE_INSTALL_PREFIX}) +set(HIP_ROOT_DIR 
${CMAKE_INSTALL_PREFIX}) set(HIP_SRC_PATH ${CMAKE_CURRENT_SOURCE_DIR}) -execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_PATH}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) +execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/cmake" "${HIP_ROOT_DIR}/cmake" RESULT_VARIABLE RUN_HIT ERROR_QUIET) if(${RUN_HIT} EQUAL 0) - execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_PATH}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) + execute_process(COMMAND "${CMAKE_COMMAND}" -E copy_directory "${HIP_SRC_PATH}/bin" "${HIP_ROOT_DIR}/bin" RESULT_VARIABLE RUN_HIT ERROR_QUIET) endif() if(${RUN_HIT} EQUAL 0) - set(CMAKE_MODULE_PATH "${HIP_PATH}/cmake" ${CMAKE_MODULE_PATH}) + set(CMAKE_MODULE_PATH "${HIP_ROOT_DIR}/cmake" ${CMAKE_MODULE_PATH}) include(${HIP_SRC_PATH}/tests/hit/HIT.cmake) # Add tests From 2235d9014e8623012741d81a9296f028d7d13c4b Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 12:38:38 +0530 Subject: [PATCH 029/171] Do not rebuild cmake cache by default Change-Id: Ie21e99beaa3465b54b5a6a77439c455f34de98b3 [ROCm/clr commit: b3b1ba1e5d3b8ba85c529c38b836d44a87caa9dd] --- projects/clr/hipamd/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index b4e80625d9..eee1a14a8a 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -142,7 +142,7 @@ add_to_config(_buildInfo COMPILE_HIP_ATP_MARKER) # Build steps ############################# # Rebuild cmake cache updates .hipInfo and .hipVersion -add_custom_target(update_build_and_version_info ALL COMMAND make rebuild_cache) +add_custom_target(update_build_and_version_info COMMAND make rebuild_cache) # Build clang hipify if enabled add_subdirectory(hipify-clang) From 732ce43e6df6a73c870b9e9d369c821406ea10ca Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Thu, 13 Apr 2017 
14:41:11 +0530 Subject: [PATCH 030/171] FindHIP: Handle remove_item from empty lists Change-Id: I6adf31b32edeae9e8454b1a2528064cf3985fca1 [ROCm/clr commit: e45ee8c37a2f8a7dc1d50d0453c8a5066330ee02] --- projects/clr/hipamd/cmake/FindHIP.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/cmake/FindHIP.cmake b/projects/clr/hipamd/cmake/FindHIP.cmake index 0001436fee..1d71238ce6 100644 --- a/projects/clr/hipamd/cmake/FindHIP.cmake +++ b/projects/clr/hipamd/cmake/FindHIP.cmake @@ -514,7 +514,7 @@ macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources ${_source_files}) + list(REMOVE_ITEM _sources "${_source_files}") if("x${HCC_HOME}" STREQUAL "x") set(HCC_HOME "/opt/rocm/hcc") endif() @@ -530,7 +530,7 @@ macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources ${_source_files}) + list(REMOVE_ITEM _sources "${_source_files}") add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 41b7984cd0e8b6889831b2a3708c2518ce4a0690 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 16 Apr 2017 14:22:48 -0500 Subject: [PATCH 031/171] Update bugs - Add CreateKernel, new signature for static kerns. 
[ROCm/clr commit: 710c83b73b8b0ec81a2925940a2f09e264dff988] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index 9452fae2fd..abb31d80e8 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -11,7 +11,13 @@ ### Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**` Some common code practices may lead to hipcc generating a error with the form : +``` undefined reference to `__hcLaunchKernel__ZN15vecAddNamespace6vecAddIidEEv16grid_launch_parmPT0_S3_S3_T_ +``` +Or: +``` +error: weak declaration cannot have internal linkage +``` Suggested workarounds: - Avoid use of static with kernel definition: @@ -26,6 +32,19 @@ namespace { } ``` +### Can't find kernels inside dynamic linked library + +HCC requires use of the "-Wl,-Bsymbolic" flag when creating a dynamic library which contains kernels. This flag causes the symbols to be created with a signature which allows HCC to discover and load the kernels in the dynamic library. This flag is often not set by default and must be added to the link step of the library. If not done, HCC will be unable to find the kernels defined in the library, and will emit a message such as: + +``` +HSADevice::CreateKernel(): Unable to create kernel +``` + +To correct, add the following flag to hcc or hipcc: +``` +$ hipcc -Wl,-Bsymbolic ... +``` + ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e.
they cannot be given internal linkage (this would clash with __attribute__((weak))); From 710efeea0cb4f2908afd556b5dfdc6f193c5ea41 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 18 Apr 2017 10:21:20 +0530 Subject: [PATCH 032/171] FindHIP: Apply remove_item on non-empty lists only Change-Id: Ib7fcb992d7e1bb679d4d86676fe3d980ba204815 [ROCm/clr commit: 137c04c536695b6aa812325921edfc93d4be1f99] --- projects/clr/hipamd/cmake/FindHIP.cmake | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/cmake/FindHIP.cmake b/projects/clr/hipamd/cmake/FindHIP.cmake index 1d71238ce6..5a5813ba0d 100644 --- a/projects/clr/hipamd/cmake/FindHIP.cmake +++ b/projects/clr/hipamd/cmake/FindHIP.cmake @@ -514,7 +514,9 @@ macro(HIP_ADD_EXECUTABLE hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources "${_source_files}") + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() if("x${HCC_HOME}" STREQUAL "x") set(HCC_HOME "/opt/rocm/hcc") endif() @@ -530,7 +532,9 @@ macro(HIP_ADD_LIBRARY hip_target) # Separate the sources from the options HIP_GET_SOURCES_AND_OPTIONS(_sources _cmake_options _hipcc_options _hcc_options _nvcc_options ${ARGN}) HIP_PREPARE_TARGET_COMMANDS(${hip_target} OBJ _generated_files _source_files ${_sources} ${_cmake_options} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) - list(REMOVE_ITEM _sources "${_source_files}") + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() add_library(${hip_target} ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${hip_target} PROPERTIES LINKER_LANGUAGE ${HIP_C_OR_CXX}) endmacro() From 
0339fbee526854f0c856054f07226393dda51c81 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Mon, 17 Apr 2017 23:58:34 -0500 Subject: [PATCH 033/171] Fix RPM HIP packages from specifying /opt Change-Id: Iec3c3b81eef4c8888d425eefc80b12488a8d20a1 [ROCm/clr commit: c5e1235f6479e55dc2d87bd5ecf147152107e2e1] --- projects/clr/hipamd/packaging/hip_base.txt | 1 + projects/clr/hipamd/packaging/hip_doc.txt | 1 + projects/clr/hipamd/packaging/hip_hcc.txt | 1 + projects/clr/hipamd/packaging/hip_nvcc.txt | 1 + projects/clr/hipamd/packaging/hip_samples.txt | 1 + 5 files changed, 5 insertions(+) diff --git a/projects/clr/hipamd/packaging/hip_base.txt b/projects/clr/hipamd/packaging/hip_base.txt index a208bc3463..836a82657b 100644 --- a/projects/clr/hipamd/packaging/hip_base.txt +++ b/projects/clr/hipamd/packaging/hip_base.txt @@ -33,5 +33,6 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_doc.txt b/projects/clr/hipamd/packaging/hip_doc.txt index d5a0c471b1..6f602c84cf 100644 --- a/projects/clr/hipamd/packaging/hip_doc.txt +++ b/projects/clr/hipamd/packaging/hip_doc.txt @@ -36,5 +36,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_hcc.txt b/projects/clr/hipamd/packaging/hip_hcc.txt index 7118c32eb9..b0808aa0bc 100644 --- a/projects/clr/hipamd/packaging/hip_hcc.txt +++ b/projects/clr/hipamd/packaging/hip_hcc.txt @@ -46,5 +46,6 @@ if(@COMPILE_HIP_ATP_MARKER@) 
else() set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") endif() +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_nvcc.txt b/projects/clr/hipamd/packaging/hip_nvcc.txt index ea4943f282..0d7c357623 100644 --- a/projects/clr/hipamd/packaging/hip_nvcc.txt +++ b/projects/clr/hipamd/packaging/hip_nvcc.txt @@ -25,5 +25,6 @@ set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") #set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, cuda >= 7.5") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_samples.txt b/projects/clr/hipamd/packaging/hip_samples.txt index f289f2a8e5..6d34a6fd40 100644 --- a/projects/clr/hipamd/packaging/hip_samples.txt +++ b/projects/clr/hipamd/packaging/hip_samples.txt @@ -24,5 +24,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) From 9f882824a960d8b519a56ade3ea59dc4308357fe Mon Sep 17 00:00:00 2001 From: James Edwards Date: Mon, 17 Apr 2017 23:58:34 -0500 Subject: [PATCH 034/171] Fix RPM HIP packages from specifying /opt Change-Id: Iec3c3b81eef4c8888d425eefc80b12488a8d20a1 [ROCm/clr commit: edfefc9aee9c24c4d0ae06c85788e5cf8b2e407b] --- projects/clr/hipamd/packaging/hip_base.txt | 1 + projects/clr/hipamd/packaging/hip_doc.txt | 1 + projects/clr/hipamd/packaging/hip_hcc.txt | 1 + projects/clr/hipamd/packaging/hip_nvcc.txt | 1 + projects/clr/hipamd/packaging/hip_samples.txt | 1 + 5 files changed, 5 insertions(+) diff --git 
a/projects/clr/hipamd/packaging/hip_base.txt b/projects/clr/hipamd/packaging/hip_base.txt index a208bc3463..836a82657b 100644 --- a/projects/clr/hipamd/packaging/hip_base.txt +++ b/projects/clr/hipamd/packaging/hip_base.txt @@ -33,5 +33,6 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "perl >= 5.0") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_doc.txt b/projects/clr/hipamd/packaging/hip_doc.txt index d5a0c471b1..6f602c84cf 100644 --- a/projects/clr/hipamd/packaging/hip_doc.txt +++ b/projects/clr/hipamd/packaging/hip_doc.txt @@ -36,5 +36,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_hcc.txt b/projects/clr/hipamd/packaging/hip_hcc.txt index 7118c32eb9..b0808aa0bc 100644 --- a/projects/clr/hipamd/packaging/hip_hcc.txt +++ b/projects/clr/hipamd/packaging/hip_hcc.txt @@ -46,5 +46,6 @@ if(@COMPILE_HIP_ATP_MARKER@) else() set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") endif() +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_nvcc.txt b/projects/clr/hipamd/packaging/hip_nvcc.txt index ea4943f282..0d7c357623 100644 --- a/projects/clr/hipamd/packaging/hip_nvcc.txt +++ b/projects/clr/hipamd/packaging/hip_nvcc.txt @@ -25,5 +25,6 @@ set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") #set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE 
"${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, cuda >= 7.5") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) diff --git a/projects/clr/hipamd/packaging/hip_samples.txt b/projects/clr/hipamd/packaging/hip_samples.txt index f289f2a8e5..6d34a6fd40 100644 --- a/projects/clr/hipamd/packaging/hip_samples.txt +++ b/projects/clr/hipamd/packaging/hip_samples.txt @@ -24,5 +24,6 @@ set(CPACK_BINARY_RPM "ON") set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") +set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") include(CPack) From 6a3bee59f1f5d77646f2bcc108babb123aa59589 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 19 Apr 2017 10:47:40 -0500 Subject: [PATCH 035/171] fix broken header in NV path Change-Id: Ia3aff2a89d9ba49547f51ce03a3304dfab58ba25 [ROCm/clr commit: ee299695a876310d5de3cd35c876ab8bd44361d1] --- projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 7e881df3ab..0cc40f32af 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -948,4 +948,6 @@ inline static hipChannelFormatDesc hipCreateChannelDesc() { return cudaCreateChannelDesc(); } -#endif +#endif //__CUDACC__ + +#endif //HIP_INCLUDE_HIP_NVCC_DETAIL_HIP_RUNTIME_API_H From 4ac3084503f2de4933dcaf182b78b645417688d2 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 19 Apr 2017 10:59:55 -0500 Subject: [PATCH 036/171] add support of hipLaunchKernelGGL on NV path Change-Id: I0aeafd80c2181873be385d985f1d8ed86a98d136 
[ROCm/clr commit: c22420920070f5809756fbeb1f8fdeb32d6fed52] --- projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h index b4fa13f48c..80da388007 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h @@ -36,6 +36,10 @@ do {\ kernelName<<<numblocks,numthreads,memperblock,streamId>>>(0, ##__VA_ARGS__);\ } while(0) +#define hipLaunchKernelGGL(kernelName, numblocks, numthreads, memperblock, streamId, ...) \ +do {\ +kernelName<<<numblocks,numthreads,memperblock,streamId>>>(__VA_ARGS__);\ +} while(0) #define hipReadModeElementType cudaReadModeElementType From acacf26ed86efb23d0260bd453937eb867d7bc2f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 21 Apr 2017 09:01:34 -0500 Subject: [PATCH 037/171] Fix compilation error with nvcc (c++ nullptr) [ROCm/clr commit: f1f907cb514492656e4e1d298b1671cc07077dae] --- projects/clr/hipamd/samples/0_Intro/square/Makefile | 1 + projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/samples/0_Intro/square/Makefile b/projects/clr/hipamd/samples/0_Intro/square/Makefile index 1e8cdba080..aa48cc5864 100644 --- a/projects/clr/hipamd/samples/0_Intro/square/Makefile +++ b/projects/clr/hipamd/samples/0_Intro/square/Makefile @@ -15,5 +15,6 @@ square.hip.out: square.hipref.cpp + clean: rm -f *.o *.out diff --git a/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp b/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp index 963ab63260..e694bfb8a4 100644 --- a/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp +++ b/projects/clr/hipamd/samples/0_Intro/square/square.hipref.cpp @@ -83,7 +83,7 @@ int main(int argc, char *argv[]) const unsigned threadsPerBlock = 256; printf ("info: launch 'vector_square' kernel\n"); - hipLaunchKernel(vector_square, 
dim3(blocks), dim3(threadsPerBlock), 0, nullptr, C_d, A_d, N); + hipLaunchKernel(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N); printf ("info: copy Device2Host\n"); CHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); From 8805f2ce66007209b106d678366a580c53d13091 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 21 Apr 2017 21:46:30 +0300 Subject: [PATCH 038/171] [HIPIFY] Initial sync HIPIFY with HIP by CUDA Driver API data types. + Introduce CUDA_Driver_API_functions_supported_by_HIP.md. + Initial update of HIPIFY with CUDA driver data types. + Initial sync HIP types against CUDA Driver and Runtime API types. + Typo fixes. [ROCm/clr commit: 342c63d39868902d28bfd72666e9161fdabed0f0] --- ...A_Driver_API_functions_supported_by_HIP.md | 499 ++++++++ .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 1044 +++++++++++------ 2 files changed, 1157 insertions(+), 386 deletions(-) create mode 100644 projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md diff --git a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md new file mode 100644 index 0000000000..3434d29a70 --- /dev/null +++ b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -0,0 +1,499 @@ +# CUDA Driver API functions supported by HIP + +## **1. 
Data types used by CUDA driver** + +| **type** | **CUDA** | **HIP** | **CUDA description** | +|-------------:|---------------------------------------------------------------|------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| struct | `CUDA_ARRAY3D_DESCRIPTOR` | | | +| struct | `CUDA_ARRAY_DESCRIPTOR` | | | +| struct | `CUDA_MEMCPY2D` | | | +| struct | `CUDA_MEMCPY3D` | | | +| struct | `CUDA_MEMCPY3D_PEER` | | | +| struct | `CUDA_POINTER_ATTRIBUTE_P2P_TOKENS` | | | +| struct | `CUDA_RESOURCE_DESC` | | | +| struct | `CUDA_RESOURCE_VIEW_DESC` | | | +| struct | `CUdevprop` | `hipDeviceProp_t` | | +| struct | `CUipcEventHandle` | | | +| struct | `CUipcMemHandle` | | | +| enum |***`CUaddress_mode`*** | | Texture reference addressing modes | +| 0 |*`CU_TR_ADDRESS_MODE_WRAP`* | | Wrapping address mode | +| 1 |*`CU_TR_ADDRESS_MODE_CLAMP`* | | Clamp to edge address mode | +| 2 |*`CU_TR_ADDRESS_MODE_MIRROR`* | | Mirror address mode | +| 3 |*`CU_TR_ADDRESS_MODE_BORDER`* | | Border address mode | +| enum |***`CUarray_cubemap_face`*** | | Array indices for cube faces | +| 0x00 |*`CU_CUBEMAP_FACE_POSITIVE_X`* | | Positive X face of cubemap | +| 0x01 |*`CU_CUBEMAP_FACE_NEGATIVE_X`* | | Negative X face of cubemap | +| 0x02 |*`CU_CUBEMAP_FACE_POSITIVE_Y`* | | Positive Y face of cubemap | +| 0x03 |*`CU_CUBEMAP_FACE_NEGATIVE_Y`* | | Negative Y face of cubemap | +| 0x04 |*`CU_CUBEMAP_FACE_POSITIVE_Z`* | | Positive Z face of cubemap | +| 0x05 |*`CU_CUBEMAP_FACE_NEGATIVE_Z`* | | Negative Z face of cubemap | +| enum |***`CUarray_format`*** | | Array formats | +| 0x01 |*`CU_AD_FORMAT_UNSIGNED_INT8`* | | Unsigned 8-bit integers | +| 0x02 |*`CU_AD_FORMAT_UNSIGNED_INT16`* | | Unsigned 16-bit integers | +| 0x03 |*`CU_AD_FORMAT_UNSIGNED_INT32`* | | Unsigned 32-bit integers | +| 0x08 |*`CU_AD_FORMAT_SIGNED_INT8`* | | Signed 8-bit integers | +| 0x09 
|*`CU_AD_FORMAT_SIGNED_INT16`* | | Signed 16-bit integers | +| 0x0a |*`CU_AD_FORMAT_SIGNED_INT32`* | | Signed 32-bit integers | +| 0x10 |*`CU_AD_FORMAT_HALF`* | | 16-bit floating point | +| 0x20 |*`CU_AD_FORMAT_FLOAT`* | | 32-bit floating point | +| enum |***`CUctx_flags`*** | | Context creation flags | +| 0x00 |*`CU_CTX_SCHED_AUTO`* | | Automatic scheduling | +| 0x01 |*`CU_CTX_SCHED_SPIN`* | | Set spin as default scheduling | +| 0x02 |*`CU_CTX_SCHED_YIELD`* | | Set yield as default scheduling | +| 0x04 |*`CU_CTX_SCHED_BLOCKING_SYNC`* | | Set blocking synchronization as default scheduling | +| 0x04 |*`CU_CTX_BLOCKING_SYNC`* | | Set blocking synchronization as default scheduling. Deprecated. This flag was deprecated as of CUDA 4.0 and was replaced with CU_CTX_SCHED_BLOCKING_SYNC.| +| 0x07 |*`CU_CTX_SCHED_MASK`* | | | +| 0x08 |*`CU_CTX_MAP_HOST`* | | Support mapped pinned allocations | +| 0x10 |*`CU_CTX_LMEM_RESIZE_TO_MAX`* | | Keep local memory allocation after launch | +| 0x1f |*`CU_CTX_FLAGS_MASK`* | | | +| enum |***`CUdevice_attribute`*** | | Device properties | +| 1 |*`CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK`* |*`hipDeviceAttributeMaxThreadsPerBlock`* | Maximum number of threads per block | +| 2 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X`* |*`hipDeviceAttributeMaxBlockDimX`* | Maximum block dimension X | +| 3 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y`* |*`hipDeviceAttributeMaxBlockDimY`* | Maximum block dimension Y | +| 4 |*`CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z`* |*`hipDeviceAttributeMaxBlockDimZ`* | Maximum block dimension Z | +| 5 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X`* |*`hipDeviceAttributeMaxGridDimX`* | Maximum grid dimension X | +| 6 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y`* |*`hipDeviceAttributeMaxGridDimY`* | Maximum grid dimension Y | +| 7 |*`CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z`* |*`hipDeviceAttributeMaxGridDimZ`* | Maximum grid dimension Z | +| 8 |*`CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK`* |*`hipDeviceAttributeMaxSharedMemoryPerBlock`* | Maximum shared 
memory available per block in bytes | +| 8 |*`CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK`* |*`hipDeviceAttributeMaxSharedMemoryPerBlock`* | Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK | +| 9 |*`CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY`* |*`hipDeviceAttributeTotalConstantMemory`* | Memory available on device for __constant__ variables in a CUDA C kernel in bytes | +| 10 |*`CU_DEVICE_ATTRIBUTE_WARP_SIZE`* |*`hipDeviceAttributeWarpSize`* | Warp size in threads | +| 11 |*`CU_DEVICE_ATTRIBUTE_MAX_PITCH`* | | Maximum pitch in bytes allowed by memory copies | +| 12 |*`CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK`* |*`hipDeviceAttributeMaxRegistersPerBlock`* | Maximum number of 32-bit registers available per block | +| 12 |*`CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK`* |*`hipDeviceAttributeMaxRegistersPerBlock`* | Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK | +| 13 |*`CU_DEVICE_ATTRIBUTE_CLOCK_RATE`* |*`hipDeviceAttributeClockRate`* | Typical clock frequency in kilohertz | +| 14 |*`CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT`* | | Alignment requirement for textures | +| 15 |*`CU_DEVICE_ATTRIBUTE_GPU_OVERLAP`* | | Device can possibly copy memory and execute a kernel concurrently. Deprecated. 
Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT| +| 16 |*`CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT`* |*`hipDeviceAttributeMultiprocessorCount`* | Number of multiprocessors on device | +| 17 |*`CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT`* | | Specifies whether there is a run time limit on kernels | +| 18 |*`CU_DEVICE_ATTRIBUTE_INTEGRATED`* | | Device is integrated with host memory | +| 19 |*`CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY`* | | Device can map host memory into CUDA address space | +| 20 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_MODE`* |*`hipDeviceAttributeComputeMode`* | Compute mode (See CUcomputemode for details) | +| 21 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH`* | | Maximum 1D texture width | +| 22 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH`* | | Maximum 2D texture width | +| 23 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT`* | | Maximum 2D texture height | +| 24 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH`* | | Maximum 3D texture width | +| 25 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT`* | | Maximum 3D texture height | +| 26 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH`* | | Maximum 3D texture depth | +| 27 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH`* | | Maximum 2D layered texture width | +| 28 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT`* | | Maximum 2D layered texture height | +| 29 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS`* | | Maximum layers in a 2D layered texture | +| 27 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH | +| 28 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT | +| 29 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES`* | | Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS | +| 30 |*`CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT`* | | Alignment requirement for surfaces | +| 31 
|*`CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS`* |*`hipDeviceAttributeConcurrentKernels`* | Device can possibly execute multiple kernels concurrently | +| 32 |*`CU_DEVICE_ATTRIBUTE_ECC_ENABLED`* | | Device has ECC support enabled | +| 33 |*`CU_DEVICE_ATTRIBUTE_PCI_BUS_ID`* |*`hipDeviceAttributePciBusId`* | PCI bus ID of the device | +| 34 |*`CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID`* |*`hipDeviceAttributePciDeviceId`* | PCI device ID of the device | +| 35 |*`CU_DEVICE_ATTRIBUTE_TCC_DRIVER`* | | Device is using TCC driver model | +| 36 |*`CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE`* |*`hipDeviceAttributeMemoryClockRate`* | Peak memory clock frequency in kilohertz | +| 37 |*`CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH`* |*`hipDeviceAttributeMemoryBusWidth`* | Global memory bus width in bits | +| 38 |*`CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE`* |*`hipDeviceAttributeL2CacheSize`* | Size of L2 cache in bytes | +| 39 |*`CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR`* |*`hipDeviceAttributeMaxThreadsPerMultiProcessor`* | Maximum resident threads per multiprocessor | +| 40 |*`CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT`* | | Number of asynchronous engines | +| 41 |*`CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING`* | | Device shares a unified address space with the host | +| 42 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH`* | | Maximum 1D layered texture width | +| 43 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS`* | | Maximum layers in a 1D layered texture | +| 44 |*`CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER`* | | Deprecated, do not use | +| 45 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH`* | | Maximum 2D texture width if CUDA_ARRAY3D_TEXTURE_GATHER is set | +| 46 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT`* | | Maximum 2D texture height if CUDA_ARRAY3D_TEXTURE_GATHER is set | +| 47 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE`* | | Alternate maximum 3D texture width | +| 48 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE`* | | Alternate maximum 3D 
texture height | +| 49 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE`* | | Alternate maximum 3D texture depth | +| 50 |*`CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID`* | | PCI domain ID of the device | +| 51 |*`CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT`* | | Pitch alignment requirement for textures | +| 52 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH`* | | Maximum cubemap texture width/height | +| 53 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH`* | | Maximum cubemap layered texture width/height | +| 54 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS`* | | Maximum layers in a cubemap layered texture | +| 55 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH`* | | Maximum 1D surface width | +| 56 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH`* | | Maximum 2D surface width | +| 57 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT`* | | Maximum 2D surface height | +| 58 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH`* | | Maximum 3D surface width | +| 59 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT`* | | Maximum 3D surface height | +| 60 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH`* | | Maximum 3D surface depth | +| 61 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH`* | | Maximum 1D layered surface width | +| 62 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS`* | | Maximum layers in a 1D layered surface | +| 63 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH`* | | Maximum 2D layered surface width | +| 64 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT`* | | Maximum 2D layered surface height | +| 65 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS`* | | Maximum layers in a 2D layered surface | +| 66 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH`* | | Maximum cubemap surface width | +| 67 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH`* | | Maximum cubemap layered surface width | +| 68 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS`* | | Maximum layers in a cubemap 
layered surface | +| 69 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH`* | | Maximum 1D linear texture width | +| 70 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH`* | | Maximum 2D linear texture width | +| 71 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT`* | | Maximum 2D linear texture height | +| 72 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH`* | | Maximum 2D linear texture pitch in bytes | +| 73 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH`* | | Maximum mipmapped 2D texture width | +| 74 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT`* | | Maximum mipmapped 2D texture height | +| 75 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR`* |*`hipDeviceAttributeComputeCapabilityMajor`* | Major compute capability version number | +| 76 |*`CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR`* |*`hipDeviceAttributeComputeCapabilityMinor`* | Minor compute capability version number | +| 77 |*`CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH`* | | Maximum mipmapped 1D texture width | +| 78 |*`CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED`* | | Device supports stream priorities | +| 79 |*`CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED`* | | Device supports caching globals in L1 | +| 80 |*`CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED`* | | Device supports caching locals in L1 | +| 81 |*`CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR`* |*`hipDeviceAttributeMaxSharedMemoryPerMultiprocessor`* | Maximum shared memory available per multiprocessor in bytes | +| 82 |*`CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR`* | | Maximum number of 32-bit registers available per multiprocessor | +| 83 |*`CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY`* |*`hipDeviceAttributeManagedMemory`* | Device can allocate managed memory on this system | +| 84 |*`CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD`* | | Device is on a multi-GPU board | +| 85 |*`CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID`* | | Unique id for a group of devices on the same multi-GPU 
board | +| 86 |*`CU_DEVICE_ATTRIBUTE_MAX`* | | | +| enum |***`CUevent_flags`*** | | Event creation flags | +| 0x00 |*`CU_EVENT_DEFAULT`* |*`hipEventDefault`* | Default event flag | +| 0x01 |*`CU_EVENT_BLOCKING_SYNC`* |*`hipEventBlockingSync`* | Event uses blocking synchronization | +| 0x02 |*`CU_EVENT_DISABLE_TIMING`* |*`hipEventDisableTiming`* | Event will not record timing data | +| 0x04 |*`CU_EVENT_INTERPROCESS`* |*`hipEventInterprocess`* | Event is suitable for interprocess use. CU_EVENT_DISABLE_TIMING must be set | +| enum |***`CUfilter_mode`*** |***`hipTextureFilterMode`*** | Texture reference filtering modes | +| 0 |*`CU_TR_FILTER_MODE_POINT`* |*`hipFilterModePoint`* | Point filter mode | +| 1 |*`CU_TR_FILTER_MODE_LINEAR`* |*`hipFilterModeLinear`* | Linear filter mode | +| enum |***`CUfunc_cache`*** |***`hipFuncCache`*** | Function cache configurations | +| 0x00 |*`CU_FUNC_CACHE_PREFER_NONE`* |*`hipFuncCachePreferNone`* | no preference for shared memory or L1 (default) | +| 0x01 |*`CU_FUNC_CACHE_PREFER_SHARED`* |*`hipFuncCachePreferShared`* | prefer larger shared memory and smaller L1 cache | +| 0x02 |*`CU_FUNC_CACHE_PREFER_L1`* |*`hipFuncCachePreferL1`* | prefer larger L1 cache and smaller shared memory | +| 0x03 |*`CU_FUNC_CACHE_PREFER_EQUAL`* |*`hipFuncCachePreferEqual`* | prefer equal sized L1 cache and shared memory | +| enum |***`CUfunction_attribute`*** | | Function properties | +| 0 |*`CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK`* | | The maximum number of threads per block, beyond which a launch of the function would fail. This number depends on both the function and the device on which the function is currently loaded. | +| 1 |*`CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES`* | | The size in bytes of statically-allocated shared memory required by this function. This does not include dynamically-allocated shared memory requested by the user at runtime. 
| +| 2 |*`CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES`* | | The size in bytes of user-allocated constant memory required by this function. | +| 3 |*`CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES`* | | The size in bytes of local memory used by each thread of this function. | +| 4 |*`CU_FUNC_ATTRIBUTE_NUM_REGS`* | | The number of registers used by each thread of this function. | +| 5 |*`CU_FUNC_ATTRIBUTE_PTX_VERSION`* | | The PTX virtual architecture version for which the function was compiled. This value is the major PTX version * 10 + the minor PTX version, so a PTX version 1.3 function would return the value 13. Note that this may return the undefined value of 0 for cubins compiled prior to CUDA 3.0. | +| 6 |*`CU_FUNC_ATTRIBUTE_BINARY_VERSION`* | | The binary architecture version for which the function was compiled. This value is the major binary version * 10 + the minor binary version, so a binary version 1.3 function would return the value 13. Note that this will return a value of 10 for legacy cubins that do not have a properly-encoded binary architecture version. | +| 7 |*`CU_FUNC_ATTRIBUTE_CACHE_MODE_CA`* | | The attribute to indicate whether the function has been compiled with user specified option "-Xptxas --dlcm=ca" set. 
| +| 8 |*`CU_FUNC_ATTRIBUTE_MAX`* | | | +| enum |***`CUgraphicsMapResourceFlags`*** | | Flags for mapping and unmapping interop resources | +| 0x00 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE`* | | | +| 0x01 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY`* | | | +| 0x02 |*`CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD`* | | | +| enum |***`CUgraphicsRegisterFlags`*** | | Flags to register a graphics resource | +| 0x00 |*`CU_GRAPHICS_REGISTER_FLAGS_NONE`* | | | +| 0x01 |*`CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY`* | | | +| 0x02 |*`CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD`* | | | +| 0x04 |*`CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST`* | | | +| 0x08 |*`CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER`* | | | +| enum |***`CUipcMem_flags`*** | | CUDA Ipc Mem Flags | +| 0x1 |*`CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS`* |*`hipIpcMemLazyEnablePeerAccess`* | Automatically enable peer access between remote devices as needed | +| enum |***`CUjit_cacheMode`*** | | Caching modes for dlcm | +| 0 |*`CU_JIT_CACHE_OPTION_NONE`* | | Compile with no -dlcm flag specified | +| |*`CU_JIT_CACHE_OPTION_CG`* | | Compile with L1 cache disabled | +| |*`CU_JIT_CACHE_OPTION_CA`* | | Compile with L1 cache enabled | +| enum |***`CUjit_fallback`*** | | Cubin matching fallback strategies | +| 0 |*`CU_PREFER_PTX`* | | Prefer to compile ptx if exact binary match not found | +| |*`CU_PREFER_BINARY`* | | Prefer to fall back to compatible binary code if exact match not found | +| enum |***`CUjit_option`*** | | Online compiler and linker options | +| 0 |*`CU_JIT_MAX_REGISTERS`* | | Max number of registers that a thread may use. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_THREADS_PER_BLOCK`* | | IN: Specifies minimum number of threads per block to target compilation for OUT: Returns the number of threads the compiler actually targeted. This restricts the resource utilization of the compiler (e.g. 
max registers) such that a block with the given number of threads should be able to launch based on register limitations. Note, this option does not currently take into account any other resource limitations, such as shared memory utilization. Cannot be combined with CU_JIT_TARGET. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_WALL_TIME`* | | Overwrites the option value with the total wall clock time, in milliseconds, spent in the compiler and linker Option type: float Applies to: compiler and linker. | +| |*`CU_JIT_INFO_LOG_BUFFER`* | | Pointer to a buffer in which to print any log messages that are informational in nature (the buffer size is specified via option CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES) Option type: char * Applies to: compiler and linker. | +| |*`CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES`* | | IN: Log buffer size in bytes. Log messages will be capped at this size (including null terminator) OUT: Amount of log buffer filled with messages Option type: unsigned int Applies to: compiler and linker. | +| |*`CU_JIT_OPTIMIZATION_LEVEL`* | | Level of optimizations to apply to generated code (0 - 4), with 4 being the default and highest level of optimizations. Option type: unsigned int Applies to: compiler only. | +| |*`CU_JIT_TARGET_FROM_CUCONTEXT`* | | No option value required. Determines the target based on the current attached context (default) Option type: No option value needed Applies to: compiler and linker. | +| |*`CU_JIT_TARGET`* | | Target is chosen based on supplied CUjit_target. Cannot be combined with CU_JIT_THREADS_PER_BLOCK. Option type: unsigned int for enumerated type CUjit_target Applies to: compiler and linker. | +| |*`CU_JIT_FALLBACK_STRATEGY`* | | Specifies choice of fallback strategy if matching cubin is not found. Choice is based on supplied CUjit_fallback. This option cannot be used with cuLink* APIs as the linker requires exact matches. Option type: unsigned int for enumerated type CUjit_fallback Applies to: compiler only. 
| +| |*`CU_JIT_GENERATE_DEBUG_INFO`* | | Specifies whether to create debug information in output (-g) (0: false, default) Option type: int Applies to: compiler and linker. | +| |*`CU_JIT_LOG_VERBOSE`* | | Generate verbose log messages (0: false, default) Option type: int Applies to: compiler and linker. | +| |*`CU_JIT_GENERATE_LINE_INFO`* | | Generate line number information (-lineinfo) (0: false, default) Option type: int Applies to: compiler only. | +| |*`CU_JIT_CACHE_MODE`* | | Specifies whether to enable caching explicitly (-dlcm) Choice is based on supplied CUjit_cacheMode_enum. Option type: unsigned int for enumerated type CUjit_cacheMode_enum Applies to: compiler only. | +| |*`CU_JIT_NUM_OPTIONS`* | | | +| enum |***`CUjit_target`*** | | Online compilation targets | +| 10 |*`CU_TARGET_COMPUTE_10`* | | Compute device class 1.0. | +| 11 |*`CU_TARGET_COMPUTE_11`* | | Compute device class 1.1. | +| 12 |*`CU_TARGET_COMPUTE_12`* | | Compute device class 1.2. | +| 13 |*`CU_TARGET_COMPUTE_13`* | | Compute device class 1.3. | +| 20 |*`CU_TARGET_COMPUTE_20`* | | Compute device class 2.0. | +| 21 |*`CU_TARGET_COMPUTE_21`* | | Compute device class 2.1. | +| 30 |*`CU_TARGET_COMPUTE_30`* | | Compute device class 3.0. | +| 32 |*`CU_TARGET_COMPUTE_32`* | | Compute device class 3.2. | +| 35 |*`CU_TARGET_COMPUTE_35`* | | Compute device class 3.5. | +| 37 |*`CU_TARGET_COMPUTE_37`* | | Compute device class 3.7. | +| 50 |*`CU_TARGET_COMPUTE_50`* | | Compute device class 5.0. | +| 52 |*`CU_TARGET_COMPUTE_52`* | | Compute device class 5.2. | +| enum |***`CUjitInputType`*** | | Device code formats | +| 0 |*`CU_JIT_INPUT_CUBIN`* | | Compiled device-class-specific device code Applicable options: none. | +| |*`CU_JIT_INPUT_PTX`* | | PTX source code Applicable options: PTX compiler options. | +| |*`CU_JIT_INPUT_FATBINARY`* | | Bundle of multiple cubins and/or PTX of some device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. 
| +| |*`CU_JIT_INPUT_OBJECT`* | | Host object with embedded device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. | +| |*`CU_JIT_INPUT_LIBRARY`* | | Archive of host objects with embedded device code Applicable options: PTX compiler options, CU_JIT_FALLBACK_STRATEGY. | +| |*`CU_JIT_NUM_INPUT_TYPES`* | | | +| enum |***`CUlimit`*** |***`hipLimit_t`*** | Limits | +| 0x00 |*`CU_LIMIT_STACK_SIZE`* | | GPU thread stack size. | +| 0x01 |*`CU_LIMIT_PRINTF_FIFO_SIZE`* | | GPU printf FIFO size. | +| 0x02 |*`CU_LIMIT_MALLOC_HEAP_SIZE`* |*`hipLimitMallocHeapSize`* | GPU malloc heap size. | +| 0x03 |*`CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH`* | | GPU device runtime launch synchronize depth. | +| 0x04 |*`CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT`* | | GPU device runtime pending launch count. | +| |*`CU_LIMIT_MAX`* | | | +| enum |***`CUmemAttach_flags`*** | | CUDA Mem Attach Flags | +| 0x1 |*`CU_MEM_ATTACH_GLOBAL`* | | Memory can be accessed by any stream on any device. | +| 0x2 |*`CU_MEM_ATTACH_HOST`* | | Memory cannot be accessed by any stream on any device. | +| 0x4 |*`CU_MEM_ATTACH_SINGLE`* | | Memory can only be accessed by a single stream on the associated device. 
| +| enum |***`CUmemorytype`*** | | Memory types | +| 0x01 |*`CU_MEMORYTYPE_HOST`* | | Host memory | +| 0x02 |*`CU_MEMORYTYPE_DEVICE`* | | Device memory | +| 0x03 |*`CU_MEMORYTYPE_ARRAY`* | | Array memory | +| 0x04 |*`CU_MEMORYTYPE_UNIFIED`* | | Unified device or host memory | +| enum |***`CUoccupancy_flags`*** | | Occupancy calculator flag | +| 0x00 |*`CU_OCCUPANCY_DEFAULT`* | | Default behavior | +| 0x01 |*`CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE`* | | Assume global caching is enabled and cannot be automatically turned off | +| enum |***`CUpointer_attribute`*** | | Pointer information | +| 1 |*`CU_POINTER_ATTRIBUTE_CONTEXT`* | | The CUcontext on which a pointer was allocated or registered | +| 2 |*`CU_POINTER_ATTRIBUTE_MEMORY_TYPE`* | | The CUmemorytype describing the physical location of a pointer | +| 3 |*`CU_POINTER_ATTRIBUTE_DEVICE_POINTER`* | | The address at which a pointer's memory may be accessed on the device | +| 4 |*`CU_POINTER_ATTRIBUTE_HOST_POINTER`* | | The address at which a pointer's memory may be accessed on the host | +| 5 |*`CU_POINTER_ATTRIBUTE_P2P_TOKENS`* | | A pair of tokens for use with the nv-p2p.h Linux kernel interface | +| 6 |*`CU_POINTER_ATTRIBUTE_SYNC_MEMOPS`* | | Synchronize every synchronous memory operation initiated on this region | +| 7 |*`CU_POINTER_ATTRIBUTE_BUFFER_ID`* | | A process-wide unique ID for an allocated memory region | +| 8 |*`CU_POINTER_ATTRIBUTE_IS_MANAGED`* | | Indicates if the pointer points to managed memory | +| enum |***`CUresourcetype`*** | | Resource types | +| 0x00 |*`CU_RESOURCE_TYPE_ARRAY`* | | Array resource | +| 0x01 |*`CU_RESOURCE_TYPE_MIPMAPPED_ARRAY`* | | Mipmapped array resource | +| 0x02 |*`CU_RESOURCE_TYPE_LINEAR`* | | Linear resource | +| 0x03 |*`CU_RESOURCE_TYPE_PITCH2D`* | | Pitch 2D resource | +| enum |***`CUresourceViewFormat`*** | | Resource view format | +| 0x00 |*`CU_RES_VIEW_FORMAT_NONE`* | | No resource view format (use underlying resource format) | +| 0x01 |*`CU_RES_VIEW_FORMAT_UINT_1X8`* 
| | 1 channel unsigned 8-bit integers | +| 0x02 |*`CU_RES_VIEW_FORMAT_UINT_2X8`* | | 2 channel unsigned 8-bit integers | +| 0x03 |*`CU_RES_VIEW_FORMAT_UINT_4X8`* | | 4 channel unsigned 8-bit integers | +| 0x04 |*`CU_RES_VIEW_FORMAT_SINT_1X8`* | | 1 channel signed 8-bit integers | +| 0x05 |*`CU_RES_VIEW_FORMAT_SINT_2X8`* | | 2 channel signed 8-bit integers | +| 0x06 |*`CU_RES_VIEW_FORMAT_SINT_4X8`* | | 4 channel signed 8-bit integers | +| 0x07 |*`CU_RES_VIEW_FORMAT_UINT_1X16`* | | 1 channel unsigned 16-bit integers | +| 0x08 |*`CU_RES_VIEW_FORMAT_UINT_2X16`* | | 2 channel unsigned 16-bit integers | +| 0x09 |*`CU_RES_VIEW_FORMAT_UINT_4X16`* | | 4 channel unsigned 16-bit integers | +| 0x0a |*`CU_RES_VIEW_FORMAT_SINT_1X16`* | | 1 channel signed 16-bit integers | +| 0x0b |*`CU_RES_VIEW_FORMAT_SINT_2X16`* | | 2 channel signed 16-bit integers | +| 0x0c |*`CU_RES_VIEW_FORMAT_SINT_4X16`* | | 4 channel signed 16-bit integers | +| 0x0d |*`CU_RES_VIEW_FORMAT_UINT_1X32`* | | 1 channel unsigned 32-bit integers | +| 0x0e |*`CU_RES_VIEW_FORMAT_UINT_2X32`* | | 2 channel unsigned 32-bit integers | +| 0x0f |*`CU_RES_VIEW_FORMAT_UINT_4X32`* | | 4 channel unsigned 32-bit integers | +| 0x10 |*`CU_RES_VIEW_FORMAT_SINT_1X32`* | | 1 channel signed 32-bit integers | +| 0x11 |*`CU_RES_VIEW_FORMAT_SINT_2X32`* | | 2 channel signed 32-bit integers | +| 0x12 |*`CU_RES_VIEW_FORMAT_SINT_4X32`* | | 4 channel signed 32-bit integers | +| 0x13 |*`CU_RES_VIEW_FORMAT_FLOAT_1X16`* | | 1 channel 16-bit floating point | +| 0x14 |*`CU_RES_VIEW_FORMAT_FLOAT_2X16`* | | 2 channel 16-bit floating point | +| 0x15 |*`CU_RES_VIEW_FORMAT_FLOAT_4X16`* | | 4 channel 16-bit floating point | +| 0x16 |*`CU_RES_VIEW_FORMAT_FLOAT_1X32`* | | 1 channel 32-bit floating point | +| 0x17 |*`CU_RES_VIEW_FORMAT_FLOAT_2X32`* | | 2 channel 32-bit floating point | +| 0x18 |*`CU_RES_VIEW_FORMAT_FLOAT_4X32`* | | 4 channel 32-bit floating point | +| 0x19 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC1`* | | Block compressed 1 | +| 0x1a 
|*`CU_RES_VIEW_FORMAT_UNSIGNED_BC2`* | | Block compressed 2 | +| 0x1b |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC3`* | | Block compressed 3 | +| 0x1c |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC4`* | | Block compressed 4 unsigned | +| 0x1d |*`CU_RES_VIEW_FORMAT_SIGNED_BC4`* | | Block compressed 4 signed | +| 0x1e |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC5`* | | Block compressed 5 unsigned | +| 0x1f |*`CU_RES_VIEW_FORMAT_SIGNED_BC5`* | | Block compressed 5 signed | +| 0x20 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC6H`* | | Block compressed 6 unsigned half-float | +| 0x21 |*`CU_RES_VIEW_FORMAT_SIGNED_BC6H`* | | Block compressed 6 signed half-float | +| 0x22 |*`CU_RES_VIEW_FORMAT_UNSIGNED_BC7`* | | Block compressed 7 | +| enum |***`CUresult`*** |***`hipError_t`*** | Error codes | +| 0 |*`CUDA_SUCCESS`* |*`hipSuccess`* | The API call returned with no errors. In the case of query calls, this can also mean that the operation being queried is complete (see cuEventQuery() and cuStreamQuery()). | +| 1 |*`CUDA_ERROR_INVALID_VALUE`* |*`hipErrorInvalidValue`* | This indicates that one or more of the parameters passed to the API call is not within an acceptable range of values. | +| 2 |*`CUDA_ERROR_OUT_OF_MEMORY`* |*`hipErrorMemoryAllocation`* | The API call failed because it was unable to allocate enough memory to perform the requested operation. | +| 3 |*`CUDA_ERROR_NOT_INITIALIZED`* |*`hipErrorNotInitialized`* | This indicates that the CUDA driver has not been initialized with cuInit() or that initialization has failed. | +| 4 |*`CUDA_ERROR_DEINITIALIZED`* |*`hipErrorDeinitialized`* | This indicates that the CUDA driver is in the process of shutting down. | +| 5 |*`CUDA_ERROR_PROFILER_DISABLED`* |*`hipErrorProfilerDisabled`* | This indicates profiler is not initialized for this run. This can happen when the application is running with external profiling tools like visual profiler. 
| +| 6 |*`CUDA_ERROR_PROFILER_NOT_INITIALIZED`* |*`hipErrorProfilerNotInitialized`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to attempt to enable/disable the profiling via cuProfilerStart or cuProfilerStop without initialization. | +| 7 |*`CUDA_ERROR_PROFILER_ALREADY_STARTED`* |*`hipErrorProfilerAlreadyStarted`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStart() when profiling is already enabled. | +| 8 |*`CUDA_ERROR_PROFILER_ALREADY_STOPPED`* |*`hipErrorProfilerAlreadyStopped`* | Deprecated This error return is deprecated as of CUDA 5.0. It is no longer an error to call cuProfilerStop() when profiling is already disabled. | +| 100 |*`CUDA_ERROR_NO_DEVICE`* |*`hipErrorNoDevice`* | This indicates that no CUDA-capable devices were detected by the installed CUDA driver. | +| 101 |*`CUDA_ERROR_INVALID_DEVICE`* |*`hipErrorInvalidDevice`* | This indicates that the device ordinal supplied by the user does not correspond to a valid CUDA device. | +| 200 |*`CUDA_ERROR_INVALID_IMAGE`* |*`hipErrorInvalidImage`* | This indicates that the device kernel image is invalid. This can also indicate an invalid CUDA module. | +| 201 |*`CUDA_ERROR_INVALID_CONTEXT`* |*`hipErrorInvalidContext`* | This most frequently indicates that there is no context bound to the current thread. This can also be returned if the context passed to an API call is not a valid handle (such as a context that has had cuCtxDestroy() invoked on it). This can also be returned if a user mixes different API versions (i.e. 3010 context with 3020 API calls). See cuCtxGetApiVersion() for more details. | +| 202 |*`CUDA_ERROR_CONTEXT_ALREADY_CURRENT`* |*`hipErrorContextAlreadyCurrent`* | This indicated that the context being supplied as a parameter to the API call was already the active context. Deprecated This error return is deprecated as of CUDA 3.2. 
It is no longer an error to attempt to push the active context via cuCtxPushCurrent(). | +| 205 |*`CUDA_ERROR_MAP_FAILED`* |*`hipErrorMapFailed`* | This indicates that a map or register operation has failed. | +| 206 |*`CUDA_ERROR_UNMAP_FAILED`* |*`hipErrorUnmapFailed`* | This indicates that an unmap or unregister operation has failed. | +| 207 |*`CUDA_ERROR_ARRAY_IS_MAPPED`* |*`hipErrorArrayIsMapped`* | This indicates that the specified array is currently mapped and thus cannot be destroyed. | +| 208 |*`CUDA_ERROR_ALREADY_MAPPED`* |*`hipErrorAlreadyMapped`* | This indicates that the resource is already mapped. | +| 209 |*`CUDA_ERROR_NO_BINARY_FOR_GPU`* |*`hipErrorNoBinaryForGpu`* | This indicates that there is no kernel image available that is suitable for the device. This can occur when a user specifies code generation options for a particular CUDA source file that do not include the corresponding device configuration. | +| 210 |*`CUDA_ERROR_ALREADY_ACQUIRED`* |*`hipErrorAlreadyAcquired`* | This indicates that a resource has already been acquired. | +| 211 |*`CUDA_ERROR_NOT_MAPPED`* |*`hipErrorNotMapped`* | This indicates that a resource is not mapped. | +| 212 |*`CUDA_ERROR_NOT_MAPPED_AS_ARRAY`* |*`hipErrorNotMappedAsArray`* | This indicates that a mapped resource is not available for access as an array. | +| 213 |*`CUDA_ERROR_NOT_MAPPED_AS_POINTER`* |*`hipErrorNotMappedAsPointer`* | This indicates that a mapped resource is not available for access as a pointer. | +| 214 |*`CUDA_ERROR_ECC_UNCORRECTABLE`* |*`hipErrorECCNotCorrectable`* | This indicates that an uncorrectable ECC error was detected during execution. | +| 215 |*`CUDA_ERROR_UNSUPPORTED_LIMIT`* |*`hipErrorUnsupportedLimit`* | This indicates that the CUlimit passed to the API call is not supported by the active device. 
| +| 216 |*`CUDA_ERROR_CONTEXT_ALREADY_IN_USE`* |*`hipErrorContextAlreadyInUse`* | This indicates that the CUcontext passed to the API call can only be bound to a single CPU thread at a time but is already bound to a CPU thread. | +| 217 |*`CUDA_ERROR_PEER_ACCESS_UNSUPPORTED`* |*`hipErrorPeerAccessUnsupported`* | This indicates that peer access is not supported across the given devices. | +| 218 |*`CUDA_ERROR_INVALID_PTX`* |*`hipErrorInvalidKernelFile`* | This indicates that a PTX JIT compilation failed. | +| 219 |*`CUDA_ERROR_INVALID_GRAPHICS_CONTEXT`* |*`hipErrorInvalidGraphicsContext`* | This indicates an error with OpenGL or DirectX context. | +| 300 |*`CUDA_ERROR_INVALID_SOURCE`* |*`hipErrorInvalidSource`* | This indicates that the device kernel source is invalid. | +| 301 |*`CUDA_ERROR_FILE_NOT_FOUND`* |*`hipErrorFileNotFound`* | This indicates that the file specified was not found. | +| 302 |*`CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND`* |*`hipErrorSharedObjectSymbolNotFound`* | This indicates that a link to a shared object failed to resolve. | +| 303 |*`CUDA_ERROR_SHARED_OBJECT_INIT_FAILED`* |*`hipErrorSharedObjectInitFailed`* | This indicates that initialization of a shared object failed. | +| 304 |*`CUDA_ERROR_OPERATING_SYSTEM`* |*`hipErrorOperatingSystem`* | This indicates that an OS call failed. | +| 400 |*`CUDA_ERROR_INVALID_HANDLE`* |*`hipErrorInvalidResourceHandle`* | This indicates that a resource handle passed to the API call was not valid. Resource handles are opaque types like CUstream and CUevent. | +| 500 |*`CUDA_ERROR_NOT_FOUND`* |*`hipErrorNotFound`* | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. | +| 600 |*`CUDA_ERROR_NOT_READY`* |*`hipErrorNotReady`* | This indicates that asynchronous operations issued previously have not completed yet. 
This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). | +| 700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`* |*`hipErrorIllegalAddress`* | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | + + +## **2. Error Handling** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **3. Initialization** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **4. Version Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **5. Device Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + +## **6. 
Device Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **7. Primary Context Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **8. Context Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **9. Context Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **10. Module Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **11. Memory Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **12. 
Unified Addressing** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **13. Stream Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **14. Event Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **15. Execution Control** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **16. Execution Control [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **17. Occupancy** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **18. 
Texture Reference Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **19. Texture Reference Management [DEPRECATED]** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **20. Surface Reference Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **21. Texture Object Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **22. Surface Object Management** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **23. Peer Context Memory Access** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **24. 
Graphics Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **25. Profiler Control** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **26. OpenGL Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **27. Direct3D 9 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **28. Direct3D 10 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **29. Direct3D 11 Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + + +## **30. 
VDPAU Interoperability** + +| **CUDA** | **HIP** | **CUDA description** | +|-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| + diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 383af0440c..5a2940322e 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -81,6 +81,7 @@ enum ConvTypes { CONV_GL, CONV_GRAPHICS, CONV_SURFACE, + CONV_JIT, CONV_OTHER, CONV_INCLUDE, CONV_INCLUDE_CUDA_MAIN_H, @@ -94,7 +95,7 @@ const char *counterNames[CONV_LAST] = { "driver", "dev", "mem", "kern", "coord_func", "math_func", "special_func", "stream", "event", "occupancy", "ctx", "module", "cache", "exec", "err", "def", "tex", "gl", - "graphics", "surface", "other", "include", "include_cuda_main_header", + "graphics", "surface", "jit", "other", "include", "include_cuda_main_header", "type", "literal", "numeric_literal"}; enum ApiTypes { @@ -190,24 +191,23 @@ struct cuda2hipMap { // Error codes and return types cuda2hipRename["CUresult"] = {"hipError_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaError"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; - // CUDA Driver API error code only - cuda2hipRename["CUDA_ERROR_INVALID_CONTEXT"] = {"hipErrorInvalidContext", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_CURRENT"] = {"hipErrorContextAlreadyCurrent", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_MAP_FAILED"] = {"hipErrorMapFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_UNMAP_FAILED"] = {"hipErrorUnmapFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ARRAY_IS_MAPPED"] = 
{"hipErrorArrayIsMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ALREADY_MAPPED"] = {"hipErrorAlreadyMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_ALREADY_ACQUIRED"] = {"hipErrorAlreadyAcquired", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED"] = {"hipErrorNotMapped", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; - cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; + // CUDA Driver API error codes only + cuda2hipRename["CUDA_ERROR_INVALID_CONTEXT"] = {"hipErrorInvalidContext", CONV_ERR, API_DRIVER}; // 201 + cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_CURRENT"] = {"hipErrorContextAlreadyCurrent", CONV_ERR, API_DRIVER}; // 202 + cuda2hipRename["CUDA_ERROR_ARRAY_IS_MAPPED"] = {"hipErrorArrayIsMapped", CONV_ERR, API_DRIVER}; // 207 + cuda2hipRename["CUDA_ERROR_ALREADY_MAPPED"] = {"hipErrorAlreadyMapped", CONV_ERR, API_DRIVER}; // 208 + cuda2hipRename["CUDA_ERROR_ALREADY_ACQUIRED"] = {"hipErrorAlreadyAcquired", CONV_ERR, API_DRIVER}; // 210 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED"] = {"hipErrorNotMapped", CONV_ERR, API_DRIVER}; // 211 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 + cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 + cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 + 
cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 + cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 + cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 // CUDA RT API error code only cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 @@ -216,8 +216,6 @@ struct cuda2hipMap { cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 - cuda2hipRename["cudaErrorMapBufferObjectFailed"] = {"hipErrorMapBufferObjectFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 14 - cuda2hipRename["cudaErrorUnmapBufferObjectFailed"] = {"hipErrorUnmapBufferObjectFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 15 cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 @@ -262,17 +260,96 @@ struct cuda2hipMap { // Deprecated as of CUDA 4.1 cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 - cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 - cuda2hipRename["CUDA_ERROR_OUT_OF_MEMORY"] = {"hipErrorMemoryAllocation", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorMemoryAllocation"] = 
{"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME}; // 2 - cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorNotInitialized", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; // 3 + cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; // 0 + cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 - cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CUDA_ERROR_INVALID_VALUE"] = {"hipErrorInvalidValue", CONV_ERR, API_DRIVER}; // 1 + cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME}; // 11 + + cuda2hipRename["CUDA_ERROR_OUT_OF_MEMORY"] = {"hipErrorMemoryAllocation", CONV_ERR, API_DRIVER}; // 2 + cuda2hipRename["cudaErrorMemoryAllocation"] = {"hipErrorMemoryAllocation", CONV_ERR, API_RUNTIME}; // 2 + + cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorNotInitialized", CONV_ERR, API_DRIVER}; // 3 + cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; // 3 + + cuda2hipRename["CUDA_ERROR_DEINITIALIZED"] = {"hipErrorDeinitialized", CONV_ERR, API_DRIVER}; // 4 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorCudartUnloading"] = {"hipErrorDeinitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 29 + + cuda2hipRename["CUDA_ERROR_PROFILER_DISABLED"] = {"hipErrorProfilerDisabled", CONV_ERR, API_DRIVER}; // 5 + cuda2hipRename["cudaErrorProfilerDisabled"] = {"hipErrorProfilerDisabled", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 55 + + cuda2hipRename["CUDA_ERROR_PROFILER_NOT_INITIALIZED"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_DRIVER}; // 6 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerNotInitialized"] = 
{"hipErrorProfilerNotInitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 56 + + cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STARTED"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_DRIVER}; // 7 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerAlreadyStarted"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 57 + + cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STOPPED"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_DRIVER}; // 8 + // Deprecated as of CUDA 5.0 + cuda2hipRename["cudaErrorProfilerAlreadyStopped"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 58 + + cuda2hipRename["CUDA_ERROR_NO_DEVICE"] = {"hipErrorNoDevice", CONV_ERR, API_DRIVER}; // 100 + cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME}; // 38 + + cuda2hipRename["CUDA_ERROR_INVALID_DEVICE"] = {"hipErrorInvalidDevice", CONV_ERR, API_DRIVER}; // 101 + cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME}; // 10 + + cuda2hipRename["CUDA_ERROR_INVALID_IMAGE"] = {"hipErrorInvalidImage", CONV_ERR, API_DRIVER}; // 200 + cuda2hipRename["cudaErrorInvalidKernelImage"] = {"hipErrorInvalidImage", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 47 + + cuda2hipRename["CUDA_ERROR_MAP_FAILED"] = {"hipErrorMapFailed", CONV_ERR, API_DRIVER}; // 205 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorMapBufferObjectFailed"] = {"hipErrorMapFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 14 + + cuda2hipRename["CUDA_ERROR_UNMAP_FAILED"] = {"hipErrorUnmapFailed", CONV_ERR, API_DRIVER}; // 206 + // TODO: double check, that these errors match + cuda2hipRename["cudaErrorUnmapBufferObjectFailed"] = {"hipErrorUnmapFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 15 + + cuda2hipRename["CUDA_ERROR_NO_BINARY_FOR_GPU"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_DRIVER}; // 209 + cuda2hipRename["cudaErrorNoKernelImageForDevice"] = 
{"hipErrorNoBinaryForGpu", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 48 + + cuda2hipRename["CUDA_ERROR_ECC_UNCORRECTABLE"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_DRIVER}; // 214 + cuda2hipRename["cudaErrorECCUncorrectable"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 39 + + cuda2hipRename["CUDA_ERROR_UNSUPPORTED_LIMIT"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_DRIVER}; // 215 + cuda2hipRename["cudaErrorUnsupportedLimit"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 42 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_DRIVER}; // 217 + cuda2hipRename["cudaErrorPeerAccessUnsupported"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 64 + + cuda2hipRename["CUDA_ERROR_INVALID_PTX"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_DRIVER}; // 218 + cuda2hipRename["cudaErrorInvalidPtx"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 78 + + cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; // 219 + cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 + + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; // 302 + cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 + + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_DRIVER}; // 303 + cuda2hipRename["cudaErrorSharedObjectInitFailed"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 41 + + cuda2hipRename["CUDA_ERROR_OPERATING_SYSTEM"] = {"hipErrorOperatingSystem", CONV_ERR, API_DRIVER}; // 304 + cuda2hipRename["cudaErrorOperatingSystem"] = 
{"hipErrorOperatingSystem", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 63 + + cuda2hipRename["CUDA_ERROR_INVALID_HANDLE"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_DRIVER}; // 400 + cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME}; // 33 + + cuda2hipRename["CUDA_ERROR_NOT_READY"] = {"hipErrorNotReady", CONV_ERR, API_DRIVER}; // 600 + cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR, API_RUNTIME}; // 34 + + cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; // 700 + cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 + + ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; // 719 + cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 @@ -280,94 +357,81 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - cuda2hipRename["CUDA_ERROR_INVALID_DEVICE"] = {"hipErrorInvalidDevice", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidDevice"] = {"hipErrorInvalidDevice", CONV_ERR, API_RUNTIME}; // 10 - - cuda2hipRename["CUDA_ERROR_INVALID_VALUE"] = {"hipErrorInvalidValue", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidValue"] = {"hipErrorInvalidValue", CONV_ERR, API_RUNTIME}; // 11 - - 
cuda2hipRename["CUDA_ERROR_DEINITIALIZED"] = {"hipErrorDeinitialized", CONV_ERR, API_DRIVER}; - // TODO: double check, that this error matches to hipErrorDeinitialized - cuda2hipRename["cudaErrorCudartUnloading"] = {"hipErrorDeinitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 29 - cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 - cuda2hipRename["CUDA_ERROR_INVALID_HANDLE"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidResourceHandle"] = {"hipErrorInvalidResourceHandle", CONV_ERR, API_RUNTIME}; // 33 - // cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorInitializationError", CONV_ERR, API_DRIVER}; // cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; - cuda2hipRename["CUDA_ERROR_NOT_READY"] = {"hipErrorNotReady", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNotReady"] = {"hipErrorNotReady", CONV_ERR, API_RUNTIME}; // 34 - - cuda2hipRename["CUDA_ERROR_NO_DEVICE"] = {"hipErrorNoDevice", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNoDevice"] = {"hipErrorNoDevice", CONV_ERR, API_RUNTIME}; // 38 - - cuda2hipRename["CUDA_ERROR_ECC_UNCORRECTABLE"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorECCUncorrectable"] = {"hipErrorECCNotCorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 39 - - cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 - - cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorSharedObjectInitFailed"] = {"hipErrorSharedObjectInitFailed", CONV_ERR, API_RUNTIME, 
HIP_UNSUPPORTED}; // 41 - - cuda2hipRename["CUDA_ERROR_UNSUPPORTED_LIMIT"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorUnsupportedLimit"] = {"hipErrorUnsupportedLimit", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 42 - - cuda2hipRename["CUDA_ERROR_INVALID_IMAGE"] = {"hipErrorInvalidImage", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidKernelImage"] = {"hipErrorInvalidImage", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 47 - - cuda2hipRename["CUDA_ERROR_NO_BINARY_FOR_GPU"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorNoKernelImageForDevice"] = {"hipErrorNoBinaryForGpu", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 48 - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 - cuda2hipRename["CUDA_ERROR_PROFILER_DISABLED"] = {"hipErrorProfilerDisabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorProfilerDisabled"] = {"hipErrorProfilerDisabled", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 55 - - cuda2hipRename["CUDA_ERROR_PROFILER_NOT_INITIALIZED"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerNotInitialized"] = {"hipErrorProfilerNotInitialized", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 56 - - cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STARTED"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerAlreadyStarted"] = {"hipErrorProfilerAlreadyStarted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 57 - - 
cuda2hipRename["CUDA_ERROR_PROFILER_ALREADY_STOPPED"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_DRIVER}; - // Deprecated as of CUDA 5.0 - cuda2hipRename["cudaErrorProfilerAlreadyStopped"] = {"hipErrorProfilerAlreadyStopped", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 58 - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 - cuda2hipRename["CUDA_ERROR_OPERATING_SYSTEM"] = {"hipErrorOperatingSystem", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorOperatingSystem"] = {"hipErrorOperatingSystem", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 63 - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_UNSUPPORTED"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessUnsupported"] = {"hipErrorPeerAccessUnsupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 64 - - cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 - - cuda2hipRename["CUDA_ERROR_INVALID_PTX"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidPtx"] = {"hipErrorInvalidKernelFile", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 78 - - cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 - - - ///////////////////////////// CUDA DRIVER API 
///////////////////////////// + // enums + cuda2hipRename["CUDA_ARRAY3D_DESCRIPTOR"] = {"HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_ARRAY_DESCRIPTOR"] = {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY2D"] = {"HIP_MEMCPY2D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY3D"] = {"HIP_MEMCPY3D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_MEMCPY3D_PEER"] = {"HIP_MEMCPY3D_PEER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_POINTER_ATTRIBUTE_P2P_TOKENS"] = {"HIP_POINTER_ATTRIBUTE_P2P_TOKENS", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_RESOURCE_DESC"] = {"HIP_RESOURCE_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUDA_RESOURCE_VIEW_DESC"] = {"HIP_RESOURCE_VIEW_DESC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + cuda2hipRename["CUipcEventHandle"] = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUipcMemHandle"] = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + + + cuda2hipRename["CUaddress_mode"] = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + + cuda2hipRename["CUarray_cubemap_face"] = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = 
{"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 + + cuda2hipRename["CUarray_format"] = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a + cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 + // Compute mode + cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) + cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 + 
cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Context flags + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; @@ -375,104 +439,108 @@ struct cuda2hipMap { // Types // NOTE: CUdevice might be changed to typedef int in the future. 
- cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; - - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; + cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) // unsupported yet by HIP - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX = 5) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ = 7) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = 
{"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrClockRate = 13) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) // Deprecated. 
Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = 
{"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE 
(cudaDevAttrConcurrentKernels = 31) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) // deprecated, do not use - // cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (no) + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; @@ -481,52 +549,232 @@ struct cuda2hipMap { cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t. // TODO: Do for Pointer Attributes the same as for Device Attributes. 
- // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER}; - // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER}; + // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"] = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"] = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"] = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_HOST_POINTER"] = {"hipPointerAttributeHostPointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_P2P_TOKENS"] = {"hipPointerAttributeP2pTokens", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_SYNC_MEMOPS"] = {"hipPointerAttributeSyncMemops", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"] = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"] = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (no) + // pointer to CUfunc_st - cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; // TODO: in HIP ihipModuleSymbol_t should be 
declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h // ToDO: examples are needed with CUfunc_st - // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; + // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; // unsupported yet by HIP - cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipFuncAttributeMaxThreadsPerBlocks", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES"] = {"hipFuncAttributeSharedSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES"] = {"hipFuncAttributeConstSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES"] = {"hipFuncAttributeLocalSizeBytes", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_NUM_REGS"] = {"hipFuncAttributeNumRegs", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_PTX_VERSION"] = {"hipFuncAttributePtxVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_BINARY_VERSION"] = {"hipFuncAttributeBinaryVersion", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_CACHE_MODE_CA"] = {"hipFuncAttributeCacheModeCA", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_FUNC_ATTRIBUTE_MAX"] = {"hipFuncAttributeMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CUfunc_cache_enum"] = 
{"hipFuncCache", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_SHARED"] = {"hipFuncCachePreferShared", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_L1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_DRIVER}; - cuda2hipRename["CU_FUNC_CACHE_PREFER_EQUAL"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER}; + // enum CUgraphicsMapResourceFlags/CUgraphicsMapResourceFlags_enum + cuda2hipRename["CUgraphicsMapResourceFlags"] = {"hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsMapFlags) + cuda2hipRename["CUgraphicsMapResourceFlags_enum"] = {"hipGraphicsMapFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsMapFlags) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE"] = {"hipGraphicsMapFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsNone = 0) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY"] = {"hipGraphicsMapFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsReadOnly = 1) + cuda2hipRename["CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaGraphicsMapFlagsWriteDiscard = 2) - cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = 
{"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; + // enum CUgraphicsRegisterFlags/CUgraphicsRegisterFlags_enum + cuda2hipRename["CUgraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsRegisterFlags) + cuda2hipRename["CUgraphicsRegisterFlags_enum"] = {"hipGraphicsRegisterFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaGraphicsRegisterFlags) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_NONE"] = {"hipGraphicsRegisterFlagsNone", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsNone = 0) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsReadOnly = 1) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsWriteDiscard = 2) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsSurfaceLoadStore = 4) + cuda2hipRename["CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 // API_Runtime ANALOGUE (cudaGraphicsRegisterFlagsTextureGather = 8) - cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; + // enum CUoccupancy_flags/CUoccupancy_flags_enum + cuda2hipRename["CUoccupancy_flags"] = {"hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUoccupancy_flags_enum"] = {"hipOccupancyFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_OCCUPANCY_DEFAULT"] = 
{"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0) + cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1) + + + + cuda2hipRename["CUfunc_cache_enum"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) + cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) + cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; // 0x00 // API_Runtime ANALOGUE (cudaFuncCachePreferNone = 0) + cuda2hipRename["CU_FUNC_CACHE_PREFER_SHARED"] = {"hipFuncCachePreferShared", CONV_CACHE, API_DRIVER}; // 0x01 // API_Runtime ANALOGUE (cudaFuncCachePreferShared = 1) + cuda2hipRename["CU_FUNC_CACHE_PREFER_L1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_DRIVER}; // 0x02 // API_Runtime ANALOGUE (cudaFuncCachePreferL1 = 2) + cuda2hipRename["CU_FUNC_CACHE_PREFER_EQUAL"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_DRIVER}; // 0x03 // API_Runtime ANALOGUE (cudaFuncCachePreferEqual = 3) + + // enum CUipcMem_flags/CUipcMem_flags_enum + cuda2hipRename["CUipcMem_flags"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUipcMem_flags_enum"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS"] = {"hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 // API_Runtime ANALOGUE (cudaIpcMemLazyEnablePeerAccess = 0x01) + + // enum CUipcMem_flags/CUipcMem_flags_enum + cuda2hipRename["CUipcMem_flags"] = {"hipIpcMemFlags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + + // JIT + // enum CUjit_cacheMode/CUjit_cacheMode_enum + 
cuda2hipRename["CUjit_cacheMode"] = {"hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_cacheMode_enum"] = {"hipJitCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_NONE"] = {"hipJitCacheModeOptionNone", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_CG"] = {"hipJitCacheModeOptionCG", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_OPTION_CA"] = {"hipJitCacheModeOptionCA", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_fallback/CUjit_fallback_enum + cuda2hipRename["CUjit_fallback"] = {"hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_fallback_enum"] = {"hipJitFallback", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_PREFER_PTX"] = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_PREFER_BINARY"] = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_option/CUjit_option_enum + cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, 
HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjit_target/CUjit_target_enum + cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_target_enum"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_10"] = {"hipJitTargetCompute10", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_11"] = {"hipJitTargetCompute11", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_12"] = {"hipJitTargetCompute12", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_13"] = {"hipJitTargetCompute13", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_20"] = {"hipJitTargetCompute20", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + 
cuda2hipRename["CU_TARGET_COMPUTE_21"] = {"hipJitTargetCompute21", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_30"] = {"hipJitTargetCompute30", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_32"] = {"hipJitTargetCompute32", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_35"] = {"hipJitTargetCompute35", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_37"] = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_50"] = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_52"] = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // enum CUjitInputType/CUjitInputType_enum + cuda2hipRename["CUjitInputType"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjitInputType_enum"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_CUBIN"] = {"hipJitInputTypeBin", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_PTX"] = {"hipJitInputTypePtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_FATBINARY"] = {"hipJitInputTypeFatBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_OBJECT"] = {"hipJitInputTypeObject", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_INPUT_LIBRARY"] = {"hipJitInputTypeLibrary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_INPUT_TYPES"] = {"hipJitInputTypeNumInputTypes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + + // Limits + cuda2hipRename["CUlimit"] = {"hipLimit_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaLimit) + cuda2hipRename["CUlimit_enum"] = {"hipLimit_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaLimit) + cuda2hipRename["CU_LIMIT_STACK_SIZE"] = {"hipLimitStackSize", CONV_TYPE, 
API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaLimitStackSize = 0x00) + cuda2hipRename["CU_LIMIT_PRINTF_FIFO_SIZE"] = {"hipLimitPrintfFifoSize", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaLimitPrintfFifoSize = 0x01) + cuda2hipRename["CU_LIMIT_MALLOC_HEAP_SIZE"] = {"hipLimitMallocHeapSize", CONV_TYPE, API_DRIVER}; // 0x02 // API_Runtime ANALOGUE (cudaLimitMallocHeapSize = 0x02) + cuda2hipRename["CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH"] = {"hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaLimitDevRuntimeSyncDepth = 0x03) + cuda2hipRename["CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT"] = {"hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaLimitDevRuntimePendingLaunchCount = 0x04) + cuda2hipRename["CU_LIMIT_MAX"] = {"hipLimitMax", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + + // enum CUmemAttach_flags/CUmemAttach_flags_enum + cuda2hipRename["CUmemAttach_flags"] = {"hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUmemAttach_flags_enum"] = {"hipMemAttachFlags_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEM_ATTACH_GLOBAL"] = {"hipMemAttachGlobal", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 // API_Runtime ANALOGUE (#define cudaMemAttachGlobal 0x01) + cuda2hipRename["CU_MEM_ATTACH_HOST"] = {"hipMemAttachHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x2 // API_Runtime ANALOGUE (#define cudaMemAttachHost 0x02) + cuda2hipRename["CU_MEM_ATTACH_SINGLE"] = {"hipMemAttachSingle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x4 // API_Runtime ANALOGUE (#define cudaMemAttachSingle 0x04) + + // enum CUmemorytype/CUmemorytype_enum + cuda2hipRename["CUmemorytype"] = {"hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no - 
cudaMemoryType is not an analogue) + cuda2hipRename["CUmemorytype_enum"] = {"hipMemType_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no - cudaMemoryType is not an analogue) + cuda2hipRename["CU_MEMORYTYPE_HOST"] = {"hipMemTypeHost", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_DEVICE"] = {"hipMemTypeDevice", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_ARRAY"] = {"hipMemTypeArray", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_MEMORYTYPE_UNIFIED"] = {"hipMemTypeUnified", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (no) + + // enum CUresourcetype + cuda2hipRename["CUresourcetype"] = {"hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceType) + cuda2hipRename["CUresourcetype_enum"] = {"hipResourceType", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceType) + cuda2hipRename["CU_RESOURCE_TYPE_ARRAY"] = {"hipResourceTypeArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaResourceTypeArray = 0x00) + cuda2hipRename["CU_RESOURCE_TYPE_MIPMAPPED_ARRAY"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaResourceTypeMipmappedArray = 0x01) + cuda2hipRename["CU_RESOURCE_TYPE_LINEAR"] = {"hipResourceTypeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaResourceTypeLinear = 0x02) + cuda2hipRename["CU_RESOURCE_TYPE_PITCH2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaResourceTypePitch2D = 0x03) + + // enum CUresourceViewFormat/CUresourceViewFormat_enum + cuda2hipRename["CUresourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE 
(cudaResourceViewFormat) + cuda2hipRename["CUresourceViewFormat_enum"] = {"hipResourceViewFormat", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaResourceViewFormat) + cuda2hipRename["CU_RES_VIEW_FORMAT_NONE"] = {"hipResViewFormatNone", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaResViewFormatNone = 0x00) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X8"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar1 = 0x01) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X8"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar2 = 0x02) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X8"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedChar4 = 0x03) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X8"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar1 = 0x04) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X8"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar2 = 0x05) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X8"] = {"hipResViewFormatSignedChar4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x06 // API_Runtime ANALOGUE (cudaResViewFormatSignedChar4 = 0x06) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X16"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort1 = 0x07) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X16"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort2 = 0x08) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X16"] = 
{"hipResViewFormatUnsignedShort4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedShort4 = 0x09) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X16"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a // API_Runtime ANALOGUE (cudaResViewFormatSignedShort1 = 0x0a) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X16"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0b // API_Runtime ANALOGUE (cudaResViewFormatSignedShort2 = 0x0b) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X16"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0c // API_Runtime ANALOGUE (cudaResViewFormatSignedShort4 = 0x0c) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_1X32"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0d // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt1 = 0x0d) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_2X32"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0e // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt2 = 0x0e) + cuda2hipRename["CU_RES_VIEW_FORMAT_UINT_4X32"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x0f // API_Runtime ANALOGUE (cudaResViewFormatUnsignedInt4 = 0x0f) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_1X32"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt1 = 0x10) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_2X32"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x11 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt2 = 0x11) + cuda2hipRename["CU_RES_VIEW_FORMAT_SINT_4X32"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x12 // API_Runtime ANALOGUE (cudaResViewFormatSignedInt4 = 0x12) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_1X16"] = {"hipResViewFormatHalf1", CONV_TEX, API_DRIVER, 
HIP_UNSUPPORTED}; // 0x13 // API_Runtime ANALOGUE (cudaResViewFormatHalf1 = 0x13) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_2X16"] = {"hipResViewFormatHalf2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x14 // API_Runtime ANALOGUE (cudaResViewFormatHalf2 = 0x14) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_4X16"] = {"hipResViewFormatHalf4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x15 // API_Runtime ANALOGUE (cudaResViewFormatHalf4 = 0x15) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_1X32"] = {"hipResViewFormatFloat1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x16 // API_Runtime ANALOGUE (cudaResViewFormatFloat1 = 0x16) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_2X32"] = {"hipResViewFormatFloat2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x17 // API_Runtime ANALOGUE (cudaResViewFormatFloat2 = 0x17) + cuda2hipRename["CU_RES_VIEW_FORMAT_FLOAT_4X32"] = {"hipResViewFormatFloat4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x18 // API_Runtime ANALOGUE (cudaResViewFormatFloat4 = 0x18) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x19 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed1 = 0x19) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1a // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed2 = 0x1a) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1b // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed3 = 0x1b) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1c // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed4 = 0x1c) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, 
API_DRIVER, HIP_UNSUPPORTED}; // 0x1d // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed4 = 0x1d) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1e // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed5 = 0x1e) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed5 = 0x1f) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC6H"] = {"hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed6H = 0x20) + cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21) + cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22) + + + + cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; + + cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUctx_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUmodule"] = {"hipModule_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUmodule"] = 
{"hipModule_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUmod_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; + cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; // TODO: // cuda2hipRename["CUstream_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; // Stream Flags - cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; - cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; + cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; + cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; // Init - cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; + cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; // Driver cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; @@ -568,14 +816,17 @@ struct cuda2hipMap { // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; - // ToDO: - // cuda2hipRename["CUevent_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // ToDo: + // cuda2hipRename["CUevent_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; // Event Flags + cuda2hipRename["CUevent_flags"] = {"hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + // ToDo: + // cuda2hipRename["CUevent_flags_enum"] = {"hipEventFlags", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_EVENT_DEFAULT"] = {"hipEventDefault", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_BLOCKING_SYNC"] = {"hipEventBlockingSync", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_DISABLE_TIMING"] = {"hipEventDisableTiming", CONV_EVENT, API_DRIVER}; cuda2hipRename["CU_EVENT_INTERPROCESS"] = {"hipEventInterprocess", CONV_EVENT, API_DRIVER}; - + // Event functions cuda2hipRename["cuEventCreate"] = {"hipEventCreate", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventDestroy_v2"] = 
{"hipEventDestroy", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventElapsedTime"] = {"hipEventElapsedTime", CONV_EVENT, API_DRIVER}; @@ -627,7 +878,7 @@ struct cuda2hipMap { cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; @@ -639,6 +890,14 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; + // Texture Reference Mngmnt + // Texture reference filtering modes + cuda2hipRename["CUfilter_mode"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) + // ToDo: + // cuda2hipRename["CUfilter_mode"] = {"CUfilter_mode_enum", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) + cuda2hipRename["CU_TR_FILTER_MODE_POINT"] = {"hipFilterModePoint", CONV_TEX, API_DRIVER}; // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) + cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"] = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaFilterModeLinear = 1) + // Profiler // unsupported yet by HIP cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED}; @@ -676,6 +935,14 @@ struct cuda2hipMap { cuda2hipRename["MINOR_VERSION"] = {"hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, 
HIP_UNSUPPORTED}; cuda2hipRename["PATCH_LEVEL"] = {"hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; + // defines + cuda2hipRename["cudaMemAttachGlobal"] = {"hipMemAttachGlobal", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_MEM_ATTACH_GLOBAL = 0x1) + cuda2hipRename["cudaMemAttachHost"] = {"hipMemAttachHost", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_MEM_ATTACH_HOST = 0x2) + cuda2hipRename["cudaMemAttachSingle"] = {"hipMemAttachSingle", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_MEM_ATTACH_SINGLE = 0x4) + + cuda2hipRename["cudaOccupancyDefault"] = {"hipOccupancyDefault", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_OCCUPANCY_DEFAULT = 0x0) + cuda2hipRename["cudaOccupancyDisableCachingOverride"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE = 0x1) + // Error API cuda2hipRename["cudaGetLastError"] = {"hipGetLastError", CONV_ERR, API_RUNTIME}; cuda2hipRename["cudaPeekAtLastError"] = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME}; @@ -766,7 +1033,7 @@ struct cuda2hipMap { cuda2hipRename["cudaHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_RUNTIME}; // Memory types - cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemoryType"] = {"hipMemoryType", CONV_MEM, API_RUNTIME}; // API_Driver ANALOGUE (no - CUmemorytype is not an analogue) cuda2hipRename["cudaMemoryTypeHost"] = {"hipMemoryTypeHost", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemoryTypeDevice"] = {"hipMemoryTypeDevice", CONV_MEM, API_RUNTIME}; @@ -838,7 +1105,6 @@ struct cuda2hipMap { cuda2hipRename["cudaEventBlockingSync"] = {"hipEventBlockingSync", CONV_EVENT, API_RUNTIME}; cuda2hipRename["cudaEventDisableTiming"] = {"hipEventDisableTiming", CONV_EVENT, API_RUNTIME}; cuda2hipRename["cudaEventInterprocess"] = 
{"hipEventInterprocess", CONV_EVENT, API_RUNTIME}; - // Streams cuda2hipRename["cudaStream_t"] = {"hipStream_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_RUNTIME}; @@ -874,93 +1140,94 @@ struct cuda2hipMap { // Attributes cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME}; - 
cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaDevAttrMaxPitch"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTextureAlignment"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) + cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; // 1 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1) + cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; // 2 // API_DRIVER 
ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2) + cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; // 3 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3) + cuda2hipRename["cudaDevAttrMaxBlockDimZ"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME}; // 4 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z = 4) + cuda2hipRename["cudaDevAttrMaxGridDimX"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME}; // 5 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X = 5) + cuda2hipRename["cudaDevAttrMaxGridDimY"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME}; // 6 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y = 6) + cuda2hipRename["cudaDevAttrMaxGridDimZ"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME}; // 7 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z = 7) + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME}; // 8 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK = 8) + cuda2hipRename["cudaDevAttrTotalConstantMemory"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME}; // 9 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY = 9) + cuda2hipRename["cudaDevAttrWarpSize"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME}; // 10 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_WARP_SIZE = 10) + cuda2hipRename["cudaDevAttrMaxPitch"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 11 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_PITCH = 11) + cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME}; // 12 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK = 12) + cuda2hipRename["cudaDevAttrClockRate"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME}; // 13 // API_DRIVER ANALOGUE 
(CU_DEVICE_ATTRIBUTE_CLOCK_RATE = 13) + cuda2hipRename["cudaDevAttrTextureAlignment"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 14 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT = 14) // Is not deprecated as CUDA Driver's API analogue CU_DEVICE_ATTRIBUTE_GPU_OVERLAP - cuda2hipRename["cudaDevAttrGpuOverlap"] = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrKernelExecTimeout"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrIntegrated"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrCanMapHostMemory"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DWidth"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DWidth"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DHeight"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DWidth"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DHeight"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DDepth"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"] = 
{"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrSurfaceAlignment"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrEccEnabled"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTccDriver"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrAsyncEngineCount"] = {"hipDevAttrAsyncEngineCount", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrUnifiedAddressing"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrPciDomainId"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrTexturePitchAlignment"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DWidth"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DWidth"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DHeight"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DWidth"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DHeight"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface3DDepth"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrLocalL1CacheSupported"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"] = 
{"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrManagedMemory"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDevAttrGpuOverlap"] = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 15 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GPU_OVERLAP = 15) + cuda2hipRename["cudaDevAttrMultiProcessorCount"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME}; // 16 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT = 16) + cuda2hipRename["cudaDevAttrKernelExecTimeout"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 17 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT = 17) + cuda2hipRename["cudaDevAttrIntegrated"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 18 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_INTEGRATED = 18) + cuda2hipRename["cudaDevAttrCanMapHostMemory"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 19 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY = 19) + cuda2hipRename["cudaDevAttrComputeMode"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME}; // 20 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_MODE = 20) + cuda2hipRename["cudaDevAttrMaxTexture1DWidth"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 21 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH = 21) + cuda2hipRename["cudaDevAttrMaxTexture2DWidth"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 22 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH = 22) + cuda2hipRename["cudaDevAttrMaxTexture2DHeight"] = 
{"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 23 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT = 23) + cuda2hipRename["cudaDevAttrMaxTexture3DWidth"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 24 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH = 24) + cuda2hipRename["cudaDevAttrMaxTexture3DHeight"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 25 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT = 25) + cuda2hipRename["cudaDevAttrMaxTexture3DDepth"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 26 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH = 26) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 27 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH = 27) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 28 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT = 28) + cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 29 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS = 29) + cuda2hipRename["cudaDevAttrSurfaceAlignment"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 30 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT = 30) + cuda2hipRename["cudaDevAttrConcurrentKernels"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME}; // 31 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS = 31) + cuda2hipRename["cudaDevAttrEccEnabled"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, 
API_RUNTIME, HIP_UNSUPPORTED}; // 32 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_ECC_ENABLED = 32) + cuda2hipRename["cudaDevAttrPciBusId"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME}; // 33 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_BUS_ID = 33) + cuda2hipRename["cudaDevAttrPciDeviceId"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME}; // 34 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID = 34) + cuda2hipRename["cudaDevAttrTccDriver"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 35 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TCC_DRIVER = 35) + cuda2hipRename["cudaDevAttrMemoryClockRate"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME}; // 36 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE = 36) + cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME}; // 37 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH = 37) + cuda2hipRename["cudaDevAttrL2CacheSize"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME}; // 38 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE = 38) + cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME}; // 39 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR = 39) + cuda2hipRename["cudaDevAttrAsyncEngineCount"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 40 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT = 40) + cuda2hipRename["cudaDevAttrUnifiedAddressing"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 41 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING = 41) + cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 42 // API_DRIVER ANALOGUE 
(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH = 42) + cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 43 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS = 43) + // 44 - no + cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 45 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH = 45) + cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 46 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT = 46) + cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 47 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE = 47) + cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 48 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE = 48) + cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 49 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE = 49) + cuda2hipRename["cudaDevAttrPciDomainId"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 50 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID = 50) + cuda2hipRename["cudaDevAttrTexturePitchAlignment"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 51 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT = 51) + cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, 
API_RUNTIME, HIP_UNSUPPORTED}; // 52 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH = 52) + cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 53 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH = 53) + cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 54 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS = 54) + cuda2hipRename["cudaDevAttrMaxSurface1DWidth"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 55 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH = 55) + cuda2hipRename["cudaDevAttrMaxSurface2DWidth"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 56 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH = 56) + cuda2hipRename["cudaDevAttrMaxSurface2DHeight"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 57 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT = 57) + cuda2hipRename["cudaDevAttrMaxSurface3DWidth"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 58 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH = 58) + cuda2hipRename["cudaDevAttrMaxSurface3DHeight"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 59 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT = 59) + cuda2hipRename["cudaDevAttrMaxSurface3DDepth"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 60 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH = 60) + cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", 
CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 61 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH = 61) + cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 62 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS = 62) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 63 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH = 63) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 64 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT = 64) + cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 65 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS = 65) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 66 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH = 66) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 67 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH = 67) + cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 68 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS = 68) + cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 69 // API_DRIVER ANALOGUE 
(CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH = 69) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 70 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH = 70) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 71 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT = 71) + cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 72 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH = 72) + cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 73 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH = 73) + cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 74 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT = 74) + cuda2hipRename["cudaDevAttrComputeCapabilityMajor"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME}; // 75 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR = 75) + cuda2hipRename["cudaDevAttrComputeCapabilityMinor"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME}; // 76 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR = 76) + cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 77 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH = 77) + cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"] = {"hipDeviceAttributeStreamPrioritiesSupported", 
CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 78 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED = 78) + cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 79 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED = 79) + cuda2hipRename["cudaDevAttrLocalL1CacheSupported"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 80 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED = 80) + cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME}; // 81 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR = 81) + cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 82 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR = 82) + cuda2hipRename["cudaDevAttrManagedMemory"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 83 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY = 83) + cuda2hipRename["cudaDevAttrIsMultiGpuBoard"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME}; // 84 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD = 84) + cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85) + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; 
@@ -970,10 +1237,11 @@ struct cuda2hipMap { cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // Pointer Attributes - cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME}; - cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME}; + // struct cudaPointerAttributes + cuda2hipRename["cudaPointerAttributes"] = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME}; + cuda2hipRename["cudaPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME}; - cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaHostGetDevicePointer"] = {"hipHostGetDevicePointer", CONV_MEM, API_RUNTIME}; // Device cuda2hipRename["cudaDeviceProp"] = {"hipDeviceProp_t", CONV_TYPE, API_RUNTIME}; @@ -985,11 +1253,11 @@ struct cuda2hipMap { cuda2hipRename["cudaSetValidDevices"] = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // Compute mode - cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) + cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["cudaComputeModeExclusive"] = 
{"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // Device Flags // unsupported yet by HIP @@ -1020,11 +1288,11 @@ struct cuda2hipMap { // Execution control // CUDA function cache configurations - cuda2hipRename["cudaFuncCache"] = {"hipFuncCache_t", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME}; - cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME}; + cuda2hipRename["cudaFuncCache"] = {"hipFuncCache_t", CONV_CACHE, API_RUNTIME}; // API_Driver ANALOGUE (CUfunc_cache) + cuda2hipRename["cudaFuncCachePreferNone"] = {"hipFuncCachePreferNone", CONV_CACHE, API_RUNTIME}; // 0 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_NONE = 0x00) + cuda2hipRename["cudaFuncCachePreferShared"] = {"hipFuncCachePreferShared", CONV_CACHE, API_RUNTIME}; // 1 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_SHARED = 0x01) + cuda2hipRename["cudaFuncCachePreferL1"] = {"hipFuncCachePreferL1", CONV_CACHE, API_RUNTIME}; // 2 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_L1 = 0x02) + cuda2hipRename["cudaFuncCachePreferEqual"] = {"hipFuncCachePreferEqual", CONV_CACHE, API_RUNTIME}; // 3 // API_Driver ANALOGUE (CU_FUNC_CACHE_PREFER_EQUAL = 0x03) // Execution control functions // unsupported yet by HIP @@ -1062,7 +1330,9 @@ struct cuda2hipMap { cuda2hipRename["cudaDeviceEnablePeerAccess"] = {"hipDeviceEnablePeerAccess", CONV_DEV, API_RUNTIME}; cuda2hipRename["cudaMemcpyPeerAsync"] = {"hipMemcpyPeerAsync", 
CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyPeer"] = {"hipMemcpyPeer", CONV_MEM, API_RUNTIME}; - cuda2hipRename["cudaIpcMemLazyEnablePeerAccess"] = {"hipIpcMemLazyEnablePeerAccess", CONV_ERR, API_RUNTIME}; + + // #define cudaIpcMemLazyEnablePeerAccess 0x01 + cuda2hipRename["cudaIpcMemLazyEnablePeerAccess"] = {"hipIpcMemLazyEnablePeerAccess", CONV_TYPE, API_RUNTIME}; // 0x01 // API_Driver ANALOGUE (CU_IPC_MEM_LAZY_ENABLE_PEER_ACCESS = 0x1) // Shared memory cuda2hipRename["cudaDeviceSetSharedMemConfig"] = {"hipDeviceSetSharedMemConfig", CONV_DEV, API_RUNTIME}; @@ -1078,14 +1348,12 @@ struct cuda2hipMap { cuda2hipRename["cudaSharedMemBankSizeEightByte"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_RUNTIME}; // Limits - cuda2hipRename["cudaLimit"] = {"hipLimit_t", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaLimitStackSize"] = {"hipLimitStackSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitPrintfFifoSize"] = {"hipLimitPrintfFifoSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitMallocHeapSize"] = {"hipLimitMallocHeapSize", CONV_DEV, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaLimitDevRuntimeSyncDepth"] = {"hipLimitPrintfFifoSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaLimitDevRuntimePendingLaunchCount"] = {"hipLimitMallocHeapSize", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaLimit"] = {"hipLimit_t", CONV_TYPE, API_RUNTIME}; // API_Driver ANALOGUE (CUlimit) + cuda2hipRename["cudaLimitStackSize"] = {"hipLimitStackSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_LIMIT_STACK_SIZE = 0x00) + cuda2hipRename["cudaLimitPrintfFifoSize"] = {"hipLimitPrintfFifoSize", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_LIMIT_PRINTF_FIFO_SIZE = 0x01) + cuda2hipRename["cudaLimitMallocHeapSize"] = {"hipLimitMallocHeapSize", CONV_TYPE, API_RUNTIME}; // 0x02 // API_Driver 
ANALOGUE (CU_LIMIT_MALLOC_HEAP_SIZE = 0x02) + cuda2hipRename["cudaLimitDevRuntimeSyncDepth"] = {"hipLimitDevRuntimeSyncDepth", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_LIMIT_DEV_RUNTIME_SYNC_DEPTH = 0x03) + cuda2hipRename["cudaLimitDevRuntimePendingLaunchCount"] = {"hipLimitDevRuntimePendingLaunchCount", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_LIMIT_DEV_RUNTIME_PENDING_LAUNCH_COUNT = 0x04) cuda2hipRename["cudaDeviceGetLimit"] = {"hipDeviceGetLimit", CONV_DEV, API_RUNTIME}; @@ -1108,10 +1376,9 @@ struct cuda2hipMap { // unsupported yet by HIP cuda2hipRename["cudaReadModeNormalizedFloat"] = {"hipReadModeNormalizedFloat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaTextureFilterMode"] = {"hipTextureFilterMode", CONV_TEX, API_RUNTIME}; - cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX, API_RUNTIME}; - // unsupported yet by HIP - cuda2hipRename["cudaFilterModeLinear"] = {"hipFilterModeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaTextureFilterMode"] = {"hipTextureFilterMode", CONV_TEX, API_RUNTIME}; // API_DRIVER ANALOGUE (CUfilter_mode) + cuda2hipRename["cudaFilterModePoint"] = {"hipFilterModePoint", CONV_TEX, API_RUNTIME}; // 0 // API_DRIVER ANALOGUE (CU_TR_FILTER_MODE_POINT = 0) + cuda2hipRename["cudaFilterModeLinear"] = {"hipFilterModeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_DRIVER ANALOGUE (CU_TR_FILTER_MODE_LINEAR = 1) cuda2hipRename["cudaBindTexture"] = {"hipBindTexture", CONV_TEX, API_RUNTIME}; cuda2hipRename["cudaUnbindTexture"] = {"hipUnbindTexture", CONV_TEX, API_RUNTIME}; @@ -1131,7 +1398,7 @@ struct cuda2hipMap { cuda2hipRename["cudaChannelFormatDesc"] = {"hipChannelFormatDesc", CONV_TEX, API_RUNTIME}; cuda2hipRename["cudaCreateChannelDesc"] = {"hipCreateChannelDesc", CONV_TEX, API_RUNTIME}; // unsupported yet by HIP - cuda2hipRename["cudaGetChannelDesc"] = {"hipGetChannelDesc", CONV_TEX, 
API_RUNTIME}; + cuda2hipRename["cudaGetChannelDesc"] = {"hipGetChannelDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // Texture Object Management // structs @@ -1139,49 +1406,52 @@ struct cuda2hipMap { cuda2hipRename["cudaResourceDesc"] = {"hipResourceDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaResourceViewDesc"] = {"hipResourceViewDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaTextureDesc"] = {"hipTextureDesc", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - // enums - // unsupported yet by HIP - cuda2hipRename["cudaResourceType"] = {"hipResourceType", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeArray"] = {"hipResourceTypeArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeMipmappedArray"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar2"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedChar4"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedChar1"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedChar2"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaResViewFormatSignedChar4"] = {"hipResViewFormatSignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort1"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort2"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedShort4"] = {"hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort1"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort2"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedShort4"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt1"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt2"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedInt4"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt1"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt2"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedInt4"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf1"] = {"hipResViewFormatHalf1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf2"] = {"hipResViewFormatHalf2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatHalf4"] = {"hipResViewFormatHalf4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaResViewFormatFloat1"] = {"hipResViewFormatFloat1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatFloat2"] = {"hipResViewFormatFloat2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatFloat4"] = {"hipResViewFormatFloat4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatSignedBlockCompressed6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + // enums + // enum cudaResourceType + cuda2hipRename["cudaResourceType"] = {"hipResourceType", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourcetype) + cuda2hipRename["cudaResourceTypeArray"] = {"hipResourceTypeArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE 
(CU_RESOURCE_TYPE_ARRAY = 0x00) + cuda2hipRename["cudaResourceTypeMipmappedArray"] = {"hipResourceTypeMipmappedArray", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_MIPMAPPED_ARRAY = 0x01) + cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02) + cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03) + + + cuda2hipRename["cudaResourceViewFormat"] = {"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourceViewFormat) + cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00) + cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01) + cuda2hipRename["cudaResViewFormatUnsignedChar2"] = {"hipResViewFormatUnsignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X8 = 0x02) + cuda2hipRename["cudaResViewFormatUnsignedChar4"] = {"hipResViewFormatUnsignedChar4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X8 = 0x03) + cuda2hipRename["cudaResViewFormatSignedChar1"] = {"hipResViewFormatSignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x04 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X8 = 0x04) + cuda2hipRename["cudaResViewFormatSignedChar2"] = {"hipResViewFormatSignedChar2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x05 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X8 = 0x05) + cuda2hipRename["cudaResViewFormatSignedChar4"] = {"hipResViewFormatSignedChar4", CONV_TEX, 
API_RUNTIME, HIP_UNSUPPORTED}; // 0x06 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X8 = 0x06) + cuda2hipRename["cudaResViewFormatUnsignedShort1"] = {"hipResViewFormatUnsignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x07 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X16 = 0x07) + cuda2hipRename["cudaResViewFormatUnsignedShort2"] = {"hipResViewFormatUnsignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x08 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X16 = 0x08) + cuda2hipRename["cudaResViewFormatUnsignedShort4"] = {"hipResViewFormatUnsignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x09 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X16 = 0x09) + cuda2hipRename["cudaResViewFormatSignedShort1"] = {"hipResViewFormatSignedShort1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0a // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X16 = 0x0a) + cuda2hipRename["cudaResViewFormatSignedShort2"] = {"hipResViewFormatSignedShort2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0b // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X16 = 0x0b) + cuda2hipRename["cudaResViewFormatSignedShort4"] = {"hipResViewFormatSignedShort4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0c // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X16 = 0x0c) + cuda2hipRename["cudaResViewFormatUnsignedInt1"] = {"hipResViewFormatUnsignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0d // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X32 = 0x0d) + cuda2hipRename["cudaResViewFormatUnsignedInt2"] = {"hipResViewFormatUnsignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0e // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_2X32 = 0x0e) + cuda2hipRename["cudaResViewFormatUnsignedInt4"] = {"hipResViewFormatUnsignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x0f // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_4X32 = 0x0f) + cuda2hipRename["cudaResViewFormatSignedInt1"] = {"hipResViewFormatSignedInt1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x10 // 
API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_1X32 = 0x10) + cuda2hipRename["cudaResViewFormatSignedInt2"] = {"hipResViewFormatSignedInt2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x11 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_2X32 = 0x11) + cuda2hipRename["cudaResViewFormatSignedInt4"] = {"hipResViewFormatSignedInt4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x12 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SINT_4X32 = 0x12) + cuda2hipRename["cudaResViewFormatHalf1"] = {"hipResViewFormatHalf1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x13 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_1X16 = 0x13) + cuda2hipRename["cudaResViewFormatHalf2"] = {"hipResViewFormatHalf2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x14 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_2X16 = 0x14) + cuda2hipRename["cudaResViewFormatHalf4"] = {"hipResViewFormatHalf4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x15 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_4X16 = 0x15) + cuda2hipRename["cudaResViewFormatFloat1"] = {"hipResViewFormatFloat1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x16 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_1X32 = 0x16) + cuda2hipRename["cudaResViewFormatFloat2"] = {"hipResViewFormatFloat2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x17 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_2X32 = 0x17) + cuda2hipRename["cudaResViewFormatFloat4"] = {"hipResViewFormatFloat4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x18 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_FLOAT_4X32 = 0x18) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed1"] = {"hipResViewFormatUnsignedBlockCompressed1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x19 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC1 = 0x19) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed2"] = {"hipResViewFormatUnsignedBlockCompressed2", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1a // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC2 = 0x1a) + 
cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed3"] = {"hipResViewFormatUnsignedBlockCompressed3", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1b // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC3 = 0x1b) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed4"] = {"hipResViewFormatUnsignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1c // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC4 = 0x1c) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed4"] = {"hipResViewFormatSignedBlockCompressed4", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1d // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC4 = 0x1d) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed5"] = {"hipResViewFormatUnsignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1e // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC5 = 0x1e) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed5"] = {"hipResViewFormatSignedBlockCompressed5", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x1f // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC5 = 0x1f) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed6H"] = {"hipResViewFormatUnsignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC6H = 0x20) + cuda2hipRename["cudaResViewFormatSignedBlockCompressed6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x21 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_SIGNED_BC6H = 0x21) + cuda2hipRename["cudaResViewFormatUnsignedBlockCompressed7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x22 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UNSIGNED_BC7 = 0x22) cuda2hipRename["cudaTextureAddressMode"] = {"hipTextureAddressMode", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeWrap"] = {"hipAddressModeWrap", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; @@ -1255,17 +1525,19 @@ 
struct cuda2hipMap { cuda2hipRename["cudaGraphicsCubeFacePositiveZ"] = {"hipGraphicsCubeFacePositiveZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGraphicsCubeFaceNegativeZ"] = {"hipGraphicsCubeFaceNegativeZ", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlags"] = {"hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsNone"] = {"hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsReadOnly"] = {"hipGraphicsMapFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsMapFlagsWriteDiscard"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; + // enum cudaGraphicsMapFlags + cuda2hipRename["cudaGraphicsMapFlags"] = {"hipGraphicsMapFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUgraphicsMapResourceFlags) + cuda2hipRename["cudaGraphicsMapFlagsNone"] = {"hipGraphicsMapFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE = 0x00) + cuda2hipRename["cudaGraphicsMapFlagsReadOnly"] = {"hipGraphicsMapFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY = 0x01) + cuda2hipRename["cudaGraphicsMapFlagsWriteDiscard"] = {"hipGraphicsMapFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD = 0x02) - cuda2hipRename["cudaGraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsNone"] = {"hipGraphicsRegisterFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsReadOnly"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - 
cuda2hipRename["cudaGraphicsRegisterFlagsWriteDiscard"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsSurfaceLoadStore"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaGraphicsRegisterFlagsTextureGather"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; + // enum cudaGraphicsRegisterFlags + cuda2hipRename["cudaGraphicsRegisterFlags"] = {"hipGraphicsRegisterFlags", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUgraphicsRegisterFlags) + cuda2hipRename["cudaGraphicsRegisterFlagsNone"] = {"hipGraphicsRegisterFlagsNone", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_NONE = 0x00) + cuda2hipRename["cudaGraphicsRegisterFlagsReadOnly"] = {"hipGraphicsRegisterFlagsReadOnly", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_READ_ONLY = 0x01) + cuda2hipRename["cudaGraphicsRegisterFlagsWriteDiscard"] = {"hipGraphicsRegisterFlagsWriteDiscard", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD = 0x02) + cuda2hipRename["cudaGraphicsRegisterFlagsSurfaceLoadStore"] = {"hipGraphicsRegisterFlagsSurfaceLoadStore", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_SURFACE_LDST = 0x04) + cuda2hipRename["cudaGraphicsRegisterFlagsTextureGather"] = {"hipGraphicsRegisterFlagsTextureGather", CONV_GRAPHICS, API_RUNTIME, HIP_UNSUPPORTED}; // 8 // API_Driver ANALOGUE (CU_GRAPHICS_REGISTER_FLAGS_TEXTURE_GATHER = 0x08) //---------------------------------------BLAS-------------------------------------// // Blas types From 268c7b035a566eb123e0394ad5646c5fdc7530a5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sat, 22 Apr 2017 01:01:31 +0300 Subject: 
[PATCH 039/171] [HIPIFY] [DOC] Readme.md update: Ubuntu 16.04 support [ROCm/clr commit: f66780f39bad82215e4f9f71d429c3735157475d] --- projects/clr/hipamd/hipify-clang/README.md | 29 ++++++++++++++++------ 1 file changed, 22 insertions(+), 7 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index 850dfb3ffa..c0d74dbe48 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -13,24 +13,31 @@ `hipify-clang` is a clang-based tool which can automate the translation of CUDA source code into portable HIP C++. The tool can automatically add extra HIP arguments (notably the "hipLaunchParm" required at the beginning of every HIP kernel call). -`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04** +`hipify-clang` has some additional dependencies explained below and can be built as a separate make step. The instructions below are specifically for **Ubuntu 14.04** and **Ubuntu 16.04**. ### Build and install - Download and unpack clang+llvm 3.8 binary package preqrequisite. + +**Ubuntu 14.04**: ```shell wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04.tar.xz ``` +**Ubuntu 16.04**: +```shell +wget http://llvm.org/releases/3.8.0/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz +tar xvfJ clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-16.04.tar.xz +``` - Enable build of hipify-clang and specify path to LLVM. -Note HIPIFY_CLANG_LLVM_DIR must be a full absolute path to the location extracted above. Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ +Note HIPIFY_CLANG_LLVM_DIR must be a full absolute path to the location extracted above. 
Here's an example assuming we extract the clang 3.8 package into ~/HIP/clang+llvm-3.8.0/ ```shell cd HIP mkdir build cd build -cmake -DHIPIFY_CLANG_LLVM_DIR=~/HIP/clang+llvm-3.8.0-x86_64-linux-gnu-ubuntu-14.04/ -DCMAKE_BUILD_TYPE=Release .. +cmake -DHIPIFY_CLANG_LLVM_DIR=~/HIP/clang+llvm-3.8.0/ -DCMAKE_BUILD_TYPE=Release .. make make install ``` @@ -41,13 +48,20 @@ make install In the case when `hipify-clang` doesn't find cuda headers, it reports various errors about unknown keywords (e.g. '\__global\__'), API function names (e.g. 'cudaMalloc'), syntax (e.g. 'foo<<<1,n>>>(...)'), etc. -To install CUDA headers, download the "deb(network)" variant of the target installer from https://developer.nvidia.com/cuda-downloads. The commands below show how to download and install a recent version from http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb. +To install CUDA headers, download the "deb(network)" variant of the target installer. 
+ +**Ubuntu 14.04**: ```shell wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_7.5-18_amd64.deb sudo dpkg -i cuda-repo-ubuntu1404_7.5-18_amd64.deb sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 ``` - +**Ubuntu 16.04**: +```shell +wget http://archive.ubuntu.com/ubuntu/pool/multiverse/n/nvidia-cuda-toolkit/nvidia-cuda-toolkit_7.5.18-0ubuntu1_amd64.deb +sudo dpkg -i nvidia-cuda-toolkit_7.5.18-0ubuntu1_amd64.deb +sudo apt-get update && sudo apt-get install cuda-minimal-build-7-5 cuda-curand-dev-7-5 +``` To set additional options like Language Selection (only "-x cuda" is supported), Preprocessor Definition (-D), Include Path (-I), etc., options delimiter "--" should be used before them, for instance: ```shell @@ -58,10 +72,11 @@ Delimiter "--" is used to separate hipify-clang options (before the delimiter) f Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). -#### Disclaimer +## Disclaimer The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of noninfringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. 
Terms and limitations applicable to the purchase or use of AMD's products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. -Copyright (c) 2014-2016 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2014-2017 Advanced Micro Devices, Inc. All rights reserved. + From f1b0693479960610f2c1f47a450ff9e6910856f0 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Fri, 21 Apr 2017 22:34:26 -0500 Subject: [PATCH 040/171] Properly link hip cmake file into top level lib directory. Change-Id: I2113a86ca6985f34fd0cfb091abdbce0f632cfc2 [ROCm/clr commit: c33c84a596ef8e76cd996ca1ca64db6106902ec7] --- projects/clr/hipamd/packaging/hip_hcc.postinst | 15 ++++++++++----- projects/clr/hipamd/packaging/hip_hcc.prerm | 10 +++++++--- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/packaging/hip_hcc.postinst b/projects/clr/hipamd/packaging/hip_hcc.postinst index 14179db767..e7d53b742b 100755 --- a/projects/clr/hipamd/packaging/hip_hcc.postinst +++ b/projects/clr/hipamd/packaging/hip_hcc.postinst @@ -8,17 +8,22 @@ popd () { } ROCMDIR=/opt/rocm -HIPDIR=$ROCMDIR/hip - -# Soft-link to libraries -HIPLIBFILES=$HIPDIR/lib/* ROCMLIBDIR=$ROCMDIR/lib +HIPDIR=$ROCMDIR/hip +HIPLIBDIR=$ROCMDIR/hip/lib + +# Soft-link to library files +HIPLIBFILES=$(ls -aF $HIPLIBDIR | grep -v [-/$]) mkdir -p $ROCMLIBDIR +mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR for f in $HIPLIBFILES do ln -s $f $(basename $f) done - ln -s $HIPDIR/lib/.hipInfo .hipInfo +# Make the hip cmake directory link. 
+pushd cmake +ln -s $HIPLIBDIR/cmake/hip hip +popd popd diff --git a/projects/clr/hipamd/packaging/hip_hcc.prerm b/projects/clr/hipamd/packaging/hip_hcc.prerm index dda313a3a4..ee64aea632 100755 --- a/projects/clr/hipamd/packaging/hip_hcc.prerm +++ b/projects/clr/hipamd/packaging/hip_hcc.prerm @@ -9,17 +9,21 @@ popd () { } ROCMDIR=/opt/rocm +ROCMLIBDIR=$ROCMDIR/lib HIPDIR=$ROCMDIR/hip +HIPLIBDIR=$ROCMDIR/hip/lib # Remove soft-links to libraries -HIPLIBFILES=$HIPDIR/lib/* -ROCMLIBDIR=$ROCMDIR/lib +HIPLIBFILES=$(ls -aF $HIPLIBDIR | grep -v [-/$]) pushd $ROCMLIBDIR for f in $HIPLIBFILES do rm $(basename $f) done -rm .hipInfo +pushd cmake +unlink hip +popd +rmdir --ignore-fail-on-non-empty cmake popd rmdir --ignore-fail-on-non-empty $ROCMLIBDIR From dc0c44713f9b3543edb6f3c3fe91484e40205065 Mon Sep 17 00:00:00 2001 From: James Edwards Date: Sat, 22 Apr 2017 15:54:14 -0500 Subject: [PATCH 041/171] Specify full path of hip libraries in link file. Change-Id: I49b788f3489e7abff6b11006ff97fdfca4e5942c [ROCm/clr commit: 197a29f14263cadaaa33727562a195069b9075d3] --- projects/clr/hipamd/packaging/hip_hcc.postinst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/packaging/hip_hcc.postinst b/projects/clr/hipamd/packaging/hip_hcc.postinst index e7d53b742b..c7f9c3184c 100755 --- a/projects/clr/hipamd/packaging/hip_hcc.postinst +++ b/projects/clr/hipamd/packaging/hip_hcc.postinst @@ -19,7 +19,7 @@ mkdir -p $ROCMLIBDIR/cmake pushd $ROCMLIBDIR for f in $HIPLIBFILES do - ln -s $f $(basename $f) + ln -s $HIPLIBDIR/$f $(basename $f) done # Make the hip cmake directory link. 
pushd cmake From 2650e1fde7519170c65369ecbc964071569d174f Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 24 Apr 2017 08:48:35 +0530 Subject: [PATCH 042/171] Updated release notes Change-Id: Ia98aff420ea9d488924dce8fe9168cec9da301ab [ROCm/clr commit: c16e0fb08850141b1f379cc5e6ada2bffe657269] --- projects/clr/hipamd/RELEASE.md | 46 ++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/projects/clr/hipamd/RELEASE.md b/projects/clr/hipamd/RELEASE.md index 34eab60833..21fd8da7bb 100644 --- a/projects/clr/hipamd/RELEASE.md +++ b/projects/clr/hipamd/RELEASE.md @@ -13,6 +13,52 @@ Upcoming: ## Revision History: +=================================================================================================== +Release: 1.0.17102 +Date: 2017.03.07 +- Lots of improvements to hipify-clang. +- Added HIP package config for cmake. +- Several bug fixes and documentation updates. + + +=================================================================================================== +Release: 1.0.17066 +Date: 2017.02.11 +- Improved support for math device functions. +- Added several half math device functions. +- Enabled support for CUDA 8.0 in hipify-clang. +- Lots of bug fixes and documentation updates. + + +=================================================================================================== +Release: 1.0.17015 +Date: 2017.01.06 +- Several improvements to the hipify-clang infrastructure. +- Refactored module and function APIs. +- HIP now defaults to linking against the shared runtime library. +- Documentation updates. 
+ + +=================================================================================================== +Release: 1.0.16502 +Date: 2016.12.13 +- Added several fast math and packaged math intrinsics +- Improved debug and profiler documentation +- Support for building and linking to HIP shared library +- Several improvements to hipify-clang +- Several bug fixes + + +=================================================================================================== +Release: 1.0.16461 +Date: 2016.11.14 +- Significant changes to the HIP Profiling APIs. Refer to the documentation for details +- Improvements to P2P support +- New API: hipDeviceGetByPCIBusId +- Several bug fixes in NV path +- hipModuleLaunch now works for multi-dim kernels + + =================================================================================================== Release:1.0 Date: 2016.11.8 From 119f5a1e67d3b20cd9d46b1dc979970ab09d7ff2 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 24 Apr 2017 15:24:16 -0500 Subject: [PATCH 043/171] changed arguments for hipPointerGetAttributes Change-Id: Ia7a7c4722c1f7d0a23f0e5cc3dd6dea6c01c1fd8 [ROCm/clr commit: 85c189c846496f3b6fa331fd49419eebb830a82e] --- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h | 4 ++-- projects/clr/hipamd/src/hip_memory.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index f9bfb5a310..80a0db7e2e 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -853,7 +853,7 @@ hipError_t hipEventQuery(hipEvent_t event) ; * * @see hipGetDeviceCount, hipGetDevice, hipSetDevice, hipChooseDevice */ -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr); +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr); /** * @brief Allocate 
memory on the default accelerator @@ -1922,7 +1922,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. - * @param [in] kernelParams + * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * * @returns hipSuccess, hipInvalidDevice, hipErrorNotInitialized, hipErrorInvalidValue diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index da5530349f..f7421f9818 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -133,7 +133,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig //_appAllocationFlags : These are flags provided by the user when allocation is performed. They are returned to user in hipHostGetFlags and other APIs. // TODO - add more info here when available. 
// -hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, void* ptr) +hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void* ptr) { HIP_INIT_API(attributes, ptr); @@ -1268,7 +1268,7 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); if(hsa_status != HSA_STATUS_SUCCESS) hipStatus = hipErrorMapBufferObjectFailed; - } + } #else hipStatus = hipErrorRuntimeOther; #endif From 8d9e22c9752067d0a0bf706ba4310b231d7fd4f3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Mon, 24 Apr 2017 15:31:07 -0500 Subject: [PATCH 044/171] fixed build issues with hipPointerGetAttributes Change-Id: I3f5fbc05bdaef720884ba949075928752a070377 [ROCm/clr commit: 1f532b06f6ce0d38ae0ccae0839c0279437955a3] --- projects/clr/hipamd/src/hip_memory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index f7421f9818..b706426efb 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -149,10 +149,10 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void attributes->devicePointer = amPointerInfo._devicePointer; attributes->isManaged = 0; if(attributes->memoryType == hipMemoryTypeHost){ - attributes->hostPointer = ptr; + attributes->hostPointer = (void*)ptr; } if(attributes->memoryType == hipMemoryTypeDevice){ - attributes->devicePointer = ptr; + attributes->devicePointer = (void*)ptr; } attributes->allocationFlags = amPointerInfo._appAllocationFlags; attributes->device = amPointerInfo._appId; From 05c136e7b8ef105631d07f7a884c6cb74b7510ae Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:02:38 -0500 Subject: [PATCH 045/171] Fix hipMalloc to return error code if allocation fails. 
[ROCm/clr commit: fb7eee01ff1a46aaa9610d42b2ced80d4493e8b1] --- .../include/hip/hcc_detail/hip_runtime_api.h | 2 +- projects/clr/hipamd/src/hip_memory.cpp | 18 +++++++++++------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 80a0db7e2e..7a99ff0810 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -863,7 +863,7 @@ hipError_t hipPointerGetAttributes(hipPointerAttribute_t *attributes, const void * * If size is 0, no memory is allocated, *ptr returns nullptr, and hipSuccess is returned. * - * @return #hipSuccess + * @return #hipSuccess, #hipErrorMemoryAllocation, #hipErrorInvalidValue (bad context, null *ptr) * * @see hipMallocPitch, hipFree, hipMallocArray, hipFreeArray, hipMalloc3D, hipMalloc3DArray, hipHostFree, hipHostMalloc */ diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index b706426efb..821f64bc76 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -207,22 +207,26 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) HIP_INIT_API(ptr, sizeBytes); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; + + auto ctx = ihipGetTlsDefaultCtx(); // return NULL pointer when malloc size is 0 if (sizeBytes == 0) { *ptr = NULL; - return ihipLogStatus(hipSuccess); - } + hip_status = hipSuccess; - auto ctx = ihipGetTlsDefaultCtx(); + } else if ((ctx==nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; - if (ctx) { + } else { auto device = ctx->getWriteableDevice(); *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); - } else { - hip_status = hipErrorMemoryAllocation; - } + if(sizeBytes && (*ptr == NULL)){ + hip_status = hipErrorMemoryAllocation; + } + + } return 
ihipLogStatus(hip_status); From eef8282c967da82f36eaea058dae3a7c4b4d457c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:03:32 -0500 Subject: [PATCH 046/171] Fix hip debug for case where copyAgent is null (host-to-host) [ROCm/clr commit: dfacfbb6412edbf5d5a2ae7c3eff80a900c3dd55] --- projects/clr/hipamd/src/hip_hcc.cpp | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 35a3e11e71..080d700e63 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1765,20 +1765,24 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, if (HIP_FORCE_P2P_HOST & 0x1) { *forceUnpinnedCopy = true; - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst but HIP_FORCE_P2P_HOST=0, forcing copy through staging buffers.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } else { - tprintf (DB_COPY, "P2P. Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + tprintf (DB_COPY, "Copy engine (dev:%d agent=0x%lx) can see src and dst.\n", + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); } } else { *forceUnpinnedCopy = true; tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? 
(*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); if (HIP_FAIL_SOC & 0x2) { fprintf (stderr, "HIP_FAIL_SOC: P2P: copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", - (*copyDevice)->getDeviceNum(), (*copyDevice)->getDevice()->_hsaAgent.handle); + *copyDevice ? (*copyDevice)->getDeviceNum() : -1, + *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); throw ihipException(hipErrorRuntimeOther); } } From 88fa7488938a190280ca189cb52c1a9bcbcb662a Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:06:54 -0500 Subject: [PATCH 047/171] Add negative testing for memory full condition. [ROCm/clr commit: 693e5abc1cdc517c415d2188ccdf2fcbadb647a5] --- .../tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1f7599491a..0a256d6362 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -56,5 +56,15 @@ int main(){ HIPCHECK_API(hipFree(NULL) , hipSuccess); HIPCHECK_API(hipHostFree(NULL) , hipSuccess); + + { + // Some negative testing - request a too-big allocation and verify it fails: + // Someday when we support virtual memory may need to refactor these: + size_t tooBig = 128LL*1024*1024*1024*1024; // 128 TB; + void *p; + HIPCHECK_API ( hipMalloc(&p, tooBig), hipErrorMemoryAllocation ); + HIPCHECK_API ( hipHostMalloc(&p, tooBig), hipErrorMemoryAllocation ); + } + passed(); } From 04082c14a7b4e94842ba0c964473ba1cce3ca1fb Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 11:53:31 -0500 Subject: [PATCH 048/171] Refactor hipMemcpy test to share mem alloc for multiple copies. 
[ROCm/clr commit: 6b16f56f38a34381f411e56412f2ce90eb091afe] --- projects/clr/hipamd/.vimrc | 1 - .../tests/src/runtimeApi/memory/hipMemcpy.cpp | 239 +++++++++++++----- 2 files changed, 182 insertions(+), 58 deletions(-) delete mode 100644 projects/clr/hipamd/.vimrc diff --git a/projects/clr/hipamd/.vimrc b/projects/clr/hipamd/.vimrc deleted file mode 100644 index 019afa57e6..0000000000 --- a/projects/clr/hipamd/.vimrc +++ /dev/null @@ -1 +0,0 @@ -:set makeprg=make\ -C\ build.hcc-LC.db diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index a320a86022..d50a810a58 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -38,6 +38,130 @@ void printSep() printf ("======================================================================================\n"); } +//------- +template +class DeviceMemory +{ +public: + DeviceMemory(size_t numElements); + ~DeviceMemory(); +public: + T * A_d; + T* B_d; + T* C_d; + T* C_dd; + + size_t _maxNumElements; +}; + +template +DeviceMemory::DeviceMemory(size_t numElements) + : _maxNumElements(numElements) +{ + T ** np = nullptr; + HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0); + + + size_t sizeElements = numElements * sizeof(T); + + + HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); +} + + +template +DeviceMemory::~DeviceMemory () +{ + T * np = nullptr; + HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0); + + HIPCHECK (hipFree(C_dd)); + + C_dd = NULL; +}; + + + +//------- +template +class HostMemory +{ +public: + HostMemory(size_t numElements, bool usePinnedHost); + void reset(size_t numElements, bool full=false) ; + ~HostMemory(); +public: + // Host arrays + T * A_h; + T* B_h; + T* C_h; + + // Host arrays, secondary copy + T * A_hh; + T* B_hh; + + size_t _maxNumElements; + bool _usePinnedHost; +}; + +template +HostMemory::HostMemory(size_t 
numElements, bool usePinnedHost) + : _maxNumElements(numElements), + _usePinnedHost(usePinnedHost) +{ + T ** np = nullptr; + HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost); + + A_hh = NULL; + B_hh = NULL; + + + size_t sizeElements = numElements * sizeof(T); + + if (usePinnedHost) { + HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); + HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); + } else { + A_hh = (T*)malloc(sizeElements); + B_hh = (T*)malloc(sizeElements); + } + +} + + +template +void +HostMemory::reset(size_t numElements, bool full) +{ + // Initialize the host data: + for (size_t i=0; i +HostMemory::~HostMemory () +{ + HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost); + + if (_usePinnedHost) { + HIPCHECK (hipHostFree(A_hh)); + HIPCHECK (hipHostFree(B_hh)); + + } else { + free(A_hh); + free(B_hh); + } + T *A_hh = NULL; + T *B_hh = NULL; + +}; @@ -52,71 +176,55 @@ void printSep() // IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). if false, use explicit memcpy direction. 
// template -void memcpytest2(size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, - usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); - T *A_d, *B_d, *C_d; - T *A_h, *B_h, *C_h; - - - HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, numElements, usePinnedHost); + hmem->reset(numElements); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - T *A_hh = NULL; - T *B_hh = NULL; - T *C_dd = NULL; + assert (numElements <= dmem->_maxNumElements); + assert (numElements <= hmem->_maxNumElements); if (useHostToHost) { - if (usePinnedHost) { - HIPCHECK ( hipHostMalloc((void**)&A_hh, sizeElements, hipHostMallocDefault) ); - HIPCHECK ( hipHostMalloc((void**)&B_hh, sizeElements, hipHostMallocDefault) ); - } else { - A_hh = (T*)malloc(sizeElements); - B_hh = (T*)malloc(sizeElements); - } - - // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(A_hh, A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(B_hh, B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(A_d, A_hh, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(A_d, A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(B_d, B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements); if (useDeviceToDevice) { - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + // Do an extra device-to-device copy here to mix things up: + HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - // Do an extra device-to-device copies here to mix things up: - HIPCHECK ( hipMemcpy(C_dd, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + //Destroy the original dmem->C_d: + HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements)); - //Destroy the original C_d: - HIPCHECK ( hipMemset(C_d, 0x5A, sizeElements)); - - HIPCHECK ( hipMemcpy(C_h, C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(C_h, C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(A_h, B_h, C_h, numElements); + HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements); + - HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, usePinnedHost); printf (" %s success\n", __func__); } @@ -129,11 +237,15 @@ void memcpytest2_for_type(size_t numElements) { printSep(); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); + HostMemory memP(numElements, 1/*usePinnedHost*/); + for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(&memD, usePinnedHost ? 
&memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -156,17 +268,19 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = free/sizeof(T)/5; + maxElem = free/sizeof(T)/20; } printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 0, 1, 1, 0); // unpinned host - HIPCHECK ( hipDeviceReset() ); - memcpytest2(elem+offset, 1, 1, 1, 0); // pinned host + memcpytest2(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host } } @@ -178,13 +292,17 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - std::thread t1 (memcpytest2,N, usePinnedHost,0,0,0); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); + + std::thread t1 (memcpytest2, &memD, &mem1, N, usePinnedHost,0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,N, usePinnedHost,0,0,0); + std::thread t2 (memcpytest2,&memD, &mem2, N, usePinnedHost,0,0,0); if (serialize) { t2.join(); } @@ -218,24 +336,30 @@ int main(int argc, char *argv[]) if (p_tests & 0x2) { - // Some tests around the 64MB boundary which have historically shown issues: - printf ("\n\n=== tests&0x2 (64MB boundary)\n"); -#if 0 + // Some tests around the 64KB boundary which have historically shown issues: + printf ("\n\n=== tests&0x2 
(64KB boundary)\n"); + size_t maxElem = 32*1024*1024; + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(15*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024, 1, 0, 0, 0); - memcpytest2(16*1024*1024+16*1024, 1, 0, 0, 0); -#endif + memcpytest2(&memD, &memP, 15*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0); + // Just over 64MB: - memcpytest2(16*1024*1024+512*1024, 1, 0, 0, 0); - memcpytest2(17*1024*1024+1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 0, 0, 0); - memcpytest2(32*1024*1024, 0, 0, 0, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); - memcpytest2(32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + + } + if (p_tests & 0x4) { printf ("\n\n=== tests&4 (test sizes and offsets)\n"); HIPCHECK ( hipDeviceReset() ); @@ -270,6 +394,7 @@ int main(int argc, char *argv[]) } + passed(); } From 37fcb3bd133f4689195b105f2ee0df61ddc2159e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 12:51:17 -0500 Subject: [PATCH 049/171] Add corrected test for offsets [ROCm/clr commit: b44a3eefd17aaee1c79a77cfbf8d50060e907f86] --- .../tests/src/runtimeApi/memory/hipMemcpy.cpp | 168 ++++++++++++------ 1 file changed, 115 insertions(+), 53 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index d50a810a58..ad798d70c1 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp 
@@ -24,6 +24,7 @@ THE SOFTWARE. * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 + * RUN_NAMED: %t hipMemcpy-offsets --tests 0x10 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -45,27 +46,42 @@ class DeviceMemory public: DeviceMemory(size_t numElements); ~DeviceMemory(); -public: - T * A_d; - T* B_d; - T* C_d; - T* C_dd; + + T *A_d() const { return _A_d + _offset; }; + T *B_d() const { return _B_d + _offset; }; + T *C_d() const { return _C_d + _offset; }; + T *C_dd() const { return _C_dd + _offset; }; + + size_t maxNumElements() const { return _maxNumElements; }; + + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; + +private: + T * _A_d; + T* _B_d; + T* _C_d; + T* _C_dd; + size_t _maxNumElements; + int _offset; }; template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements) + : _maxNumElements(numElements), + _offset(0) { T ** np = nullptr; - HipTest::initArrays (&A_d, &B_d, &C_d, np, np, np, numElements, 0); + HipTest::initArrays (&_A_d, &_B_d, &_C_d, np, np, np, numElements, 0); size_t sizeElements = numElements * sizeof(T); - HIPCHECK ( hipMalloc(&C_dd, sizeElements) ); + HIPCHECK ( hipMalloc(&_C_dd, sizeElements) ); } @@ -73,11 +89,11 @@ template DeviceMemory::~DeviceMemory () { T * np = nullptr; - HipTest::freeArrays (A_d, B_d, C_d, np, np, np, 0); + HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); - HIPCHECK (hipFree(C_dd)); + HIPCHECK (hipFree(_C_dd)); - C_dd = NULL; + _C_dd = NULL; }; @@ -90,6 +106,8 @@ public: HostMemory(size_t numElements, bool usePinnedHost); void reset(size_t numElements, bool full=false) ; ~HostMemory(); + + size_t maxNumElements() const { return _maxNumElements; }; public: // Host arrays T * A_h; @@ -176,21 +194,22 @@ HostMemory::~HostMemory () // IN: useMemkindDefault : If true, use memkinddefault (runtime figures out direction). 
if false, use explicit memcpy direction. // template -void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool usePinnedHost, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) +void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d\n", + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:%+d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, - hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, + dmem->offset()); hmem->reset(numElements); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - assert (numElements <= dmem->_maxNumElements); - assert (numElements <= hmem->_maxNumElements); + assert (numElements <= dmem->maxNumElements()); + assert (numElements <= hmem->maxNumElements()); @@ -200,25 +219,25 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(dmem->A_d, hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d, hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d, dmem->B_d, dmem->C_d, numElements); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); if (useDeviceToDevice) { // Do an extra device-to-device copy here to mix things up: - HIPCHECK ( hipMemcpy(dmem->C_dd, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); + HIPCHECK ( hipMemcpy(dmem->C_dd(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyDeviceToDevice)); - //Destroy the original dmem->C_d: - HIPCHECK ( hipMemset(dmem->C_d, 0x5A, sizeElements)); + //Destroy the original dmem->C_d(): + HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d, sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d(), sizeElements, useMemkindDefault? 
hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); @@ -245,7 +264,7 @@ void memcpytest2_for_type(size_t numElements) for (int useHostToHost =0; useHostToHost<=1; useHostToHost++) { // TODO for (int useDeviceToDevice =0; useDeviceToDevice<=1; useDeviceToDevice++) { for (int useMemkindDefault =0; useMemkindDefault<=1; useMemkindDefault++) { - memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault); + memcpytest2(&memD, usePinnedHost ? &memP : &memU, numElements, useHostToHost, useDeviceToDevice, useMemkindDefault); } } } @@ -256,7 +275,7 @@ void memcpytest2_for_type(size_t numElements) //--- //Try many different sizes to memory copy. template -void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) +void memcpytest2_sizes(size_t maxElem=0) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); @@ -268,19 +287,59 @@ void memcpytest2_sizes(size_t maxElem=0, size_t offset=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = free/sizeof(T)/20; + maxElem = free/sizeof(T)/5; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB offset=%lu\n", - deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0, offset); + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); DeviceMemory memD(maxElem); HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); - for (size_t elem=64; elem+offset<=maxElem; elem*=2) { - memcpytest2(&memD, &memU, elem+offset, 0, 1, 1, 0); // unpinned host - memcpytest2(&memD, &memP, elem+offset, 1, 1, 1, 0); // pinned host + for (size_t elem=1; elem<=maxElem; elem*=2) { + memcpytest2(&memD, &memU, elem, 
1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } +} + + +//--- +//Try many different sizes to memory copy. +template +void memcpytest2_offsets(size_t maxElem) +{ + printSep(); + printf ("test: %s<%s>\n", __func__, TYPENAME(T)); + + int deviceId; + HIPCHECK(hipGetDevice(&deviceId)); + + size_t free, total; + HIPCHECK(hipMemGetInfo(&free, &total)); + + + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); + HIPCHECK ( hipDeviceReset() ); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 1/*usePinnedHost*/); + + size_t elem = maxElem / 2; + + for (int offset=0; offset < 512; offset++) { + assert (elem + offset < maxElem); + memD.offset(offset); + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host + } + + for (int offset=512; offset < maxElem; offset*=2) { + assert (elem + offset < maxElem); + memD.offset(offset); + memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host + memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } } @@ -296,13 +355,13 @@ void multiThread_1(bool serialize, bool usePinnedHost) HostMemory mem1(N, usePinnedHost); HostMemory mem2(N, usePinnedHost); - std::thread t1 (memcpytest2, &memD, &mem1, N, usePinnedHost,0,0,0); + std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - std::thread t2 (memcpytest2,&memD, &mem2, N, usePinnedHost,0,0,0); + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); } @@ -343,17 +402,17 @@ int main(int argc, char *argv[]) HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 1, 0, 0, 
0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 1, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 1, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -361,16 +420,19 @@ int main(int argc, char *argv[]) if (p_tests & 0x4) { - printf ("\n\n=== tests&4 (test sizes and offsets)\n"); + printf ("\n\n=== tests&4 (test sizes)\n"); HIPCHECK ( hipDeviceReset() ); + memcpytest2_sizes(0); printSep(); - memcpytest2_sizes(0,0); - printSep(); - memcpytest2_sizes(0,64); - printSep(); - memcpytest2_sizes(1024*1024, 13); - printSep(); - memcpytest2_sizes(1024*1024, 50); + } + + + if (p_tests & 0x10) { + printf ("\n\n=== tests&4 (test offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + memcpytest2_offsets(256*1024*1024); + memcpytest2_offsets(256*1024*1024); + memcpytest2_offsets(256*1024*1024); } if (p_tests & 0x8) { From 67383ddb5c537bbcd5ae0aa333db22a4b011b410 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 16:02:22 -0500 Subject: [PATCH 050/171] Add test for non-page-aligned mem copies. 
[ROCm/clr commit: d120b2dd123829057736e34a43681998b21f4d4e] --- .../tests/src/runtimeApi/memory/hipMemcpy.cpp | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index ad798d70c1..c48f780e44 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 - * RUN_NAMED: %t hipMemcpy-offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-dev_offsets --tests 0x10 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -335,7 +335,7 @@ void memcpytest2_offsets(size_t maxElem) memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } - for (int offset=512; offset < maxElem; offset*=2) { + for (int offset=512; offset < elem; offset*=2) { assert (elem + offset < maxElem); memD.offset(offset); memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host @@ -427,13 +427,6 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x10) { - printf ("\n\n=== tests&4 (test offsets)\n"); - HIPCHECK ( hipDeviceReset() ); - memcpytest2_offsets(256*1024*1024); - memcpytest2_offsets(256*1024*1024); - memcpytest2_offsets(256*1024*1024); - } if (p_tests & 0x8) { printf ("\n\n=== tests&8\n"); @@ -456,6 +449,16 @@ int main(int argc, char *argv[]) } + if (p_tests & 0x10) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize); + memcpytest2_offsets (maxSize); + memcpytest2_offsets(maxSize); + } + + passed(); From 0e042a1529ab3f1900951c9d895f570e3c6ac2cb Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 16:55:29 -0500 Subject: [PATCH 051/171] Tailor pointer info for 
src/dst before calling HCC copy routines. HCC sometimes uses the srcPtrInfo or dstPtrInfo to determine the pointer. Make sure these use the actual pointer and not the base of the allocation. [ROCm/clr commit: 3da8e94cbff8fb977a7186c09996fbcdb0b1d0fc] --- projects/clr/hipamd/src/hip_hcc.cpp | 66 +++++++++++++++++++++++++++-- 1 file changed, 62 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 080d700e63..71d947488d 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1798,6 +1798,62 @@ void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const h } +// the pointer-info as returned by HC refers to the allocation +// This routine modifies the pointer-info so it appears to refer to the specific ptr and sizeBytes. +// TODO -remove this when HCC uses HSA pointer info functions directly. +void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + const char *ptrc = static_cast (ptr); + if (ptrInfo->_sizeBytes == 0) { + // invalid ptrInfo, don't modify + return; + } else if (ptrInfo->_isInDeviceMem) { + assert (ptrInfo->_devicePointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_devicePointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_devicePointer = const_cast (ptr); + + if (ptrInfo->_hostPointer != nullptr) { + ptrInfo->_hostPointer = static_cast(ptrInfo->_hostPointer) + diff; + } + + } else { + + assert (ptrInfo->_hostPointer != nullptr); + std::ptrdiff_t diff = ptrc - static_cast (ptrInfo->_hostPointer); + + //TODO : assert-> runtime assert that only appears in debug mode + assert (diff >= 0); + assert (diff <= ptrInfo->_sizeBytes); + + ptrInfo->_hostPointer = const_cast(ptr); + + if (ptrInfo->_devicePointer != nullptr) { + ptrInfo->_devicePointer = static_cast(ptrInfo->_devicePointer) 
+ diff; + } + } + + assert (sizeBytes <= ptrInfo->_sizeBytes); + ptrInfo->_sizeBytes = sizeBytes; +}; + + +bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +{ + bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS); + + if (tracked) { + tailorPtrInfo(ptrInfo, ptr, sizeBytes); + } + + return tracked; +}; + + // TODO : For registered and host memory, if the portable flag is set, we need to recognize that and perform appropriate copy operation. // What can happen now is that Portable memory is mapped into multiple devices but Peer access is not enabled. i // The peer detection logic doesn't see that the memory is already mapped and so tries to use an unpinned copy algorithm. If this is PinInPlace, then an error can occur. @@ -1816,8 +1872,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: @@ -1877,6 +1933,7 @@ void ihipStream_t::lockedSymbolCopySync(hc::accelerator &acc, void* dst, void* s void ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* src, size_t sizeBytes, size_t offset, unsigned kind) { + // TODO - review - this looks broken , should not be adding pointers to tracker dynamically: if(kind == hipMemcpyHostToDevice) { hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); @@ -1903,6 +1960,7 @@ void 
ihipStream_t::lockedSymbolCopyAsync(hc::accelerator &acc, void* dst, void* } } + void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes, unsigned kind) { @@ -1930,8 +1988,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::accelerator acc; hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); - bool dstTracked = (hc::am_memtracker_getinfo(&dstPtrInfo, dst) == AM_SUCCESS); - bool srcTracked = (hc::am_memtracker_getinfo(&srcPtrInfo, src) == AM_SUCCESS); + bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); hc::hcCommandKind hcCopyDir; From aa4c89380eef37486ec482345b3ea47737916825 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 24 Apr 2017 20:38:37 -0500 Subject: [PATCH 052/171] Refactor hipHostRegister to cover misaligned cases. [ROCm/clr commit: 5ba167b82b7aed87fb1a3ae9ba25c3bfdc89925a] --- .../src/runtimeApi/memory/hipHostRegister.cpp | 129 ++++++++++++------ 1 file changed, 84 insertions(+), 45 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp index 1a1319c500..efa23b4068 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,87 +19,126 @@ THE SOFTWARE. 
/* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t + * RUN: %t --tests 0x1 + * RUN: %t --tests 0x2 * HIT_END */ +// TODO - bug if run both back-to-back + #include"test_common.h" #include __global__ void Inc(hipLaunchParm lp, float *Ad){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; -Ad[tx] = Ad[tx] + float(1); + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = Ad[tx] + float(1); } -int main(){ - float *A, **Ad; - int num_devices; - HIPCHECK(hipGetDeviceCount(&num_devices)); - Ad = new float*[num_devices]; - const size_t size = N * sizeof(float); - A = (float*)malloc(size); - HIPCHECK(hipHostRegister(A, size, 0)); + +template +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) +{ + A = A + offset; + numElements -= offset; + + size_t sizeBytes = numElements * sizeof(T); + + HIPCHECK(hipHostRegister(A, sizeBytes, 0)); - for(int i=0;iOFFSETS_TO_TRY); + for (size_t i=0; i Date: Mon, 24 Apr 2017 21:05:29 -0500 Subject: [PATCH 053/171] Refactor hipHostRegister test. - Add more testing for offsets. - Parse cmdline options and use --tests. [ROCm/clr commit: e7af4ef641709ad8c91d4c9ebd701a4bb38d571e] --- .../src/runtimeApi/memory/hipHostRegister.cpp | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp index efa23b4068..8cf0979261 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -21,10 +21,11 @@ THE SOFTWARE. 
* BUILD: %t %s ../../test_common.cpp * RUN: %t --tests 0x1 * RUN: %t --tests 0x2 + * RUN: %t --tests 0x4 * HIT_END */ -// TODO - bug if run both back-to-back +// TODO - bug if run both back-to-back, once fixed should just need one command line #include"test_common.h" #include @@ -36,14 +37,16 @@ __global__ void Inc(hipLaunchParm lp, float *Ad){ template -void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) { A = A + offset; numElements -= offset; size_t sizeBytes = numElements * sizeof(T); - HIPCHECK(hipHostRegister(A, sizeBytes, 0)); + if (internalRegister) { + HIPCHECK(hipHostRegister(A, sizeBytes, 0)); + } // Reset @@ -67,7 +70,9 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd) }; } - HIPCHECK(hipHostUnregister(A)); + if (internalRegister) { + HIPCHECK(hipHostUnregister(A)); + } } @@ -112,7 +117,7 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x2) { + if (p_tests & 0x6) { // Sensitize HIP bug if device does not match where the memory was registered. HIPCHECK(hipSetDevice(0)); @@ -125,11 +130,22 @@ int main(int argc, char *argv[]) Bh = (float*)malloc(size); HIPCHECK(hipMalloc(&Bd, size)); - // TODO - change to 256: + // TODO - set to 128 #define OFFSETS_TO_TRY 1 assert (N>OFFSETS_TO_TRY); - for (size_t i=0; i Date: Mon, 24 Apr 2017 21:22:56 -0500 Subject: [PATCH 054/171] Add host offset checking [ROCm/clr commit: fbf28a87281041dfbe2d6e27877dae0c51b2ae7a] --- .../tests/src/runtimeApi/memory/hipMemcpy.cpp | 91 +++++++++++++------ 1 file changed, 65 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index c48f780e44..749ec0de77 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -24,7 +24,8 @@ THE SOFTWARE. 
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 * RUN_NAMED: %t hipMemcpy-size --tests 0x6 - * RUN_NAMED: %t hipMemcpy-dev_offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-dev-offsets --tests 0x10 + * RUN_NAMED: %t hipMemcpy-host-offsets --tests 0x20 * RUN_NAMED: %t hipMemcpy-multithreaded --tests 0x8 * HIT_END */ @@ -107,28 +108,43 @@ public: void reset(size_t numElements, bool full=false) ; ~HostMemory(); + + T *A_h() const { return _A_h + _offset; }; + T *B_h() const { return _B_h + _offset; }; + T *C_h() const { return _C_h + _offset; }; + + + size_t maxNumElements() const { return _maxNumElements; }; + + void offset(int offset) { _offset = offset; }; + int offset() const { return _offset; }; public: - // Host arrays - T * A_h; - T* B_h; - T* C_h; // Host arrays, secondary copy T * A_hh; T* B_hh; - size_t _maxNumElements; bool _usePinnedHost; +private: + size_t _maxNumElements; + + int _offset; + + // Host arrays + T * _A_h; + T* _B_h; + T* _C_h; }; template HostMemory::HostMemory(size_t numElements, bool usePinnedHost) : _maxNumElements(numElements), - _usePinnedHost(usePinnedHost) + _usePinnedHost(usePinnedHost), + _offset(0) { T ** np = nullptr; - HipTest::initArrays (np, np, np, &A_h, &B_h, &C_h, numElements, usePinnedHost); + HipTest::initArrays (np, np, np, &_A_h, &_B_h, &_C_h, numElements, usePinnedHost); A_hh = NULL; B_hh = NULL; @@ -157,8 +173,8 @@ HostMemory::reset(size_t numElements, bool full) (B_hh)[i] = 1492.0 + i; // Phi if (full) { - (A_h)[i] = 3.146f + i; // Pi - (B_h)[i] = 1.618f + i; // Phi + (_A_h)[i] = 3.146f + i; // Pi + (_B_h)[i] = 1.618f + i; // Phi } } } @@ -166,7 +182,7 @@ HostMemory::reset(size_t numElements, bool full) template HostMemory::~HostMemory () { - HipTest::freeArraysForHost (A_h, B_h, C_h, _usePinnedHost); + HipTest::freeArraysForHost (_A_h, _B_h, _C_h, _usePinnedHost); if (_usePinnedHost) { HIPCHECK (hipHostFree(A_hh)); @@ -197,12 +213,13 @@ template void 
memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:%+d\n", + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, - dmem->offset()); + dmem->offset(), hmem->offset() + ); hmem->reset(numElements); @@ -215,15 +232,15 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, if (useHostToHost) { // Do some extra host-to-host copies here to mix things up: - HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); - HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h, sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->A_hh, hmem->A_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); + HIPCHECK ( hipMemcpy(hmem->B_hh, hmem->B_h(), sizeElements, useMemkindDefault? hipMemcpyDefault : hipMemcpyHostToHost)); HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_hh, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } else { - HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); - HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h, sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->A_d(), hmem->A_h(), sizeElements, useMemkindDefault ? 
hipMemcpyDefault : hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(dmem->B_d(), hmem->B_h(), sizeElements, useMemkindDefault ? hipMemcpyDefault : hipMemcpyHostToDevice)); } hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, dmem->A_d(), dmem->B_d(), dmem->C_d(), numElements); @@ -235,13 +252,13 @@ void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, //Destroy the original dmem->C_d(): HIPCHECK ( hipMemset(dmem->C_d(), 0x5A, sizeElements)); - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_dd(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } else { - HIPCHECK ( hipMemcpy(hmem->C_h, dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); + HIPCHECK ( hipMemcpy(hmem->C_h(), dmem->C_d(), sizeElements, useMemkindDefault? hipMemcpyDefault:hipMemcpyDeviceToHost)); } HIPCHECK ( hipDeviceSynchronize() ); - HipTest::checkVectorADD(hmem->A_h, hmem->B_h, hmem->C_h, numElements); + HipTest::checkVectorADD(hmem->A_h(), hmem->B_h(), hmem->C_h(), numElements); @@ -307,7 +324,7 @@ void memcpytest2_sizes(size_t maxElem=0) //--- //Try many different sizes to memory copy. 
template -void memcpytest2_offsets(size_t maxElem) +void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) { printSep(); printf ("test: %s<%s>\n", __func__, TYPENAME(T)); @@ -330,14 +347,26 @@ void memcpytest2_offsets(size_t maxElem) for (int offset=0; offset < 512; offset++) { assert (elem + offset < maxElem); - memD.offset(offset); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } for (int offset=512; offset < elem; offset*=2) { assert (elem + offset < maxElem); - memD.offset(offset); + if (devOffsets) { + memD.offset(offset); + } + if (hostOffsets) { + memU.offset(offset); + memP.offset(offset); + } memcpytest2(&memD, &memU, elem, 1, 1, 0); // unpinned host memcpytest2(&memD, &memP, elem, 1, 1, 0); // pinned host } @@ -453,9 +482,19 @@ int main(int argc, char *argv[]) printf ("\n\n=== tests&0x10 (test device offsets)\n"); HIPCHECK ( hipDeviceReset() ); size_t maxSize = 256*1024; - memcpytest2_offsets (maxSize); - memcpytest2_offsets (maxSize); - memcpytest2_offsets(maxSize); + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets (maxSize, true, false); + memcpytest2_offsets(maxSize, true, false); + } + + + if (p_tests & 0x20) { + printf ("\n\n=== tests&0x10 (test device offsets)\n"); + HIPCHECK ( hipDeviceReset() ); + size_t maxSize = 256*1024; + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets (maxSize, false, true); + memcpytest2_offsets(maxSize, false, true); } From a82ecbcc131383f518f8153040caf60af73a7c67 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Tue, 25 Apr 2017 00:13:32 -0500 Subject: [PATCH 055/171] fix hip_complex.h header on NV path Change-Id: Ia95d003ca1b284bab1c76723050e6b3b89178f65 [ROCm/clr commit: 50daa408aa71a546c8b95998542ac81612f9a5d7] --- projects/clr/hipamd/include/hip/nvcc_detail/hip_complex.h | 6 
+++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_complex.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_complex.h index 84afb13e50..20cb24460c 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_complex.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_complex.h @@ -64,7 +64,7 @@ __device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hi } __device__ __host__ static inline float hipCabsf(hipFloatComplex z){ - return cuCabsf(p, q); + return cuCabsf(z); } typedef cuDoubleComplex hipDoubleComplex; @@ -85,7 +85,7 @@ __device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z){ return cuConj(z); } -__device__ __host__ static inline hipDoubleComplex hipCsqabs(hipDoubleComplex z){ +__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z){ return cuCabs(z) * cuCabs(z); } @@ -123,7 +123,7 @@ __device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q return cuCfmaf(p, q, r); } -__device__ __host__ static inline hipDoubleComplex hipCfma(hipComplex p, hipComplex q, hipComplex r){ +__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q, hipDoubleComplex r){ return cuCfma(p, q, r); } From afec5e075b52801073bc0f2ebdcdf5ebeeb40983 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 18:56:57 -0500 Subject: [PATCH 056/171] added hipFuncSetCacheConfig API for nvcc path Change-Id: I87fae35bc0e10a0dca5ae1c5015fe5d9e52a1d0d [ROCm/clr commit: e91c35fde06d0f9a40a6a25ddfb43c21f2ede7b3] --- .../include/hip/nvcc_detail/hip_runtime_api.h | 4 +++ .../module/hipFuncSetCacheConfig.cpp | 36 +++++++++++++++++++ 2 files changed, 40 insertions(+) create mode 100644 projects/clr/hipamd/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 
0cc40f32af..4feefcc342 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -897,6 +897,10 @@ inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, } +inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) +{ + return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig)); +} #ifdef __cplusplus } diff --git a/projects/clr/hipamd/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp b/projects/clr/hipamd/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp new file mode 100644 index 0000000000..e3c3efad3d --- /dev/null +++ b/projects/clr/hipamd/tests/src/runtimeApi/module/hipFuncSetCacheConfig.cpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include +#include +#include"test_common.h" + +int main(){ + hipFuncCache_t cacheConfig; + void *func; + hipFuncSetCacheConfig(func, cacheConfig); + passed(); +} + From c662b426dca1835cf14b498af08b15496b0717ab Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Wed, 26 Apr 2017 19:01:10 -0500 Subject: [PATCH 057/171] fixed fast math expf and exp10f Change-Id: I73963220f902efebb0a7404c5f8966dffb4c35ca [ROCm/clr commit: f368271872827aebb83c7c1af742d1b280847513] --- projects/clr/hipamd/src/device_util.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/device_util.cpp b/projects/clr/hipamd/src/device_util.cpp index 8ce53765b5..b730412874 100644 --- a/projects/clr/hipamd/src/device_util.cpp +++ b/projects/clr/hipamd/src/device_util.cpp @@ -1163,18 +1163,18 @@ __device__ double __hip_precise_dsqrt_rz(double x) { return hc::precise_math::sqrt(x); } -#define LOG_BASE2_E_DIV_2 0.4426950408894701 -#define LOG_BASE2_5 2.321928094887362 +#define LOG_BASE2_E 1.4426950408889634 +#define LOG_BASE2_10 3.32192809488736 #define ONE_DIV_LOG_BASE2_E 0.69314718056 #define ONE_DIV_LOG_BASE2_10 0.30102999566 // Fast Math Intrinsics __device__ float __hip_fast_exp10f(float x) { - return __hip_fast_exp2f(x*LOG_BASE2_E_DIV_2); + return __hip_fast_exp2f(x*LOG_BASE2_E); } __device__ float __hip_fast_expf(float x) { - return __hip_fast_expf(x*LOG_BASE2_5); + return __hip_fast_exp2f(x*LOG_BASE2_10); } __device__ float __hip_fast_frsqrt_rn(float x) { From 6dccc18fa0b5f350589a8bf8332588e81c48bffb Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 28 Apr 2017 11:53:11 -0500 Subject: [PATCH 058/171] fixed hipFuncSetCacheConfig on rocm path Change-Id: I937a3afbf115edc94a753a0beb2230ed60a6f021 [ROCm/clr commit: 208bdbbcbb84e420bc8d31f43a325d946515faf1] --- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h | 2 +- 
projects/clr/hipamd/src/hip_device.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 7a99ff0810..6917f04f96 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -385,7 +385,7 @@ hipError_t hipDeviceGetLimit(size_t *pValue, enum hipLimit_t limit); * Note: AMD devices and recent Nvidia GPUS do not support reconfigurable cache. This hint is ignored on those architectures. * */ -hipError_t hipFuncSetCacheConfig ( hipFuncCache_t config ); +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t config ); /** * @brief Returns bank width of shared memory for current device diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 88d94411e8..01a213190f 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -112,7 +112,7 @@ hipError_t hipDeviceGetLimit (size_t *pValue, hipLimit_t limit) } } -hipError_t hipFuncSetCacheConfig (hipFuncCache_t cacheConfig) +hipError_t hipFuncSetCacheConfig (const void* func, hipFuncCache_t cacheConfig) { HIP_INIT_API(cacheConfig); From 3e06497eaad2084d437b05093cc7e25399c523b3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:03:03 +0300 Subject: [PATCH 059/171] [HIPIFY] [FIX] replacement error: cudaError_t -> hipError_t_t https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/71 [Solution] getUnqualifiedType for enumConstantDecl's type is added, except ordinary enum declarations (w/o typedef). [ToDo] Find more appropriate way of distinguishing redefined enum declarations and ordinary ones. 
[ROCm/clr commit: 5f76cf3098d41f8e596899549f844020a6a29cd4] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 5a2940322e..f0fa8331dc 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -191,7 +191,7 @@ struct cuda2hipMap { // Error codes and return types cuda2hipRename["CUresult"] = {"hipError_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; +// cuda2hipRename["cudaError_enum"] = {"hipError_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["cudaError_t"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; cuda2hipRename["cudaError"] = {"hipError_t", CONV_TYPE, API_RUNTIME}; @@ -2806,12 +2806,11 @@ private: bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = - enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - // anonymous typedef enum - if (name.empty()) { - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); - name = QT.getAsString(); + StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + StringRef name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + name = name_unqualified; } SourceLocation sl = enumConstantDecl->getLocStart(); SourceManager *SM = Result.SourceManager; From 3252416883eeb2b73e7fe1eca8307845ea165b38 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:45:36 +0300 Subject: [PATCH 060/171] * [HIPIFY] [FIX] Replacement error: enum cudaMemcpyKind kind -> hipMemcpyKindyKind kind 
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/72 [Solution] [Workaround] Offset calculation for enum VarDecl as param decl, declared with enum type specifier. [Result] enum cudaMemcpyKind kind -> enum hipMemcpyKind kind [ToDo] Test on terminal qualifiers (const, etc). [ROCm/clr commit: 85a32c3987250ec8374296140463fb12547be2b2] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index f0fa8331dc..35c930f3af 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2812,8 +2812,19 @@ private: if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; } - SourceLocation sl = enumConstantDecl->getLocStart(); + // Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- SourceManager *SM = Result.SourceManager; + SourceLocation sl(enumConstantDecl->getLocStart()); + SourceLocation end(enumConstantDecl->getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { updateCounters(found->second, name.str()); From 03595a98838e51aae91621f15060c30a38bb557c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 28 Apr 2017 21:59:33 +0300 Subject: [PATCH 061/171] [HIPIFY] Rename enumConstantDecl -> enumDecl Reason: not to mix up with clang's enumConstantDecl, used for enum DeclRefExpr (enum 
constant). [ROCm/clr commit: 3e89277a9ba6127592b7168eff399d46bfbce95a] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 35c930f3af..390b4ee88c 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2804,10 +2804,10 @@ private: return false; } - bool cudaEnumConstantDecl(const MatchFinder::MatchResult &Result) { - if (const VarDecl *enumConstantDecl = Result.Nodes.getNodeAs("cudaEnumConstantDecl")) { - StringRef name = enumConstantDecl->getType()->getAsTagDecl()->getNameAsString(); - QualType QT = enumConstantDecl->getType().getUnqualifiedType(); + bool cudaEnumDecl(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { + StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + QualType QT = enumDecl->getType().getUnqualifiedType(); StringRef name_unqualified = QT.getAsString(); if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { name = name_unqualified; @@ -2816,8 +2816,8 @@ private: // Example: void func(enum cudaMemcpyKind kind); //------------------------------------------------- SourceManager *SM = Result.SourceManager; - SourceLocation sl(enumConstantDecl->getLocStart()); - SourceLocation end(enumConstantDecl->getLocEnd()); + SourceLocation sl(enumDecl->getLocStart()); + SourceLocation end(enumDecl->getLocEnd()); size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); size_t offset = sfull.find(name); @@ -3123,7 +3123,7 @@ public: if (cudaCall(Result)) break; if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; - if (cudaEnumConstantDecl(Result)) break; + if 
(cudaEnumDecl(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3169,7 +3169,7 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(enumDecl())) - .bind("cudaEnumConstantDecl"), + .bind("cudaEnumDecl"), Callback); Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) From 9739d5d60f70766b5cf2b8ab331e59d669506417 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 3 May 2017 22:29:12 +0530 Subject: [PATCH 062/171] Added support for hipMemcpy2DAsync in HIP/HCC Change-Id: Ia4a8306f2dc1e33a81a7195ec29aef652fcccc4b [ROCm/clr commit: b136e80a450d8b5521c61ec76bb7f3b6a5174a41] --- .../include/hip/hcc_detail/hip_runtime_api.h | 21 +++++++++++++++++++ projects/clr/hipamd/src/hip_memory.cpp | 18 ++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 6917f04f96..9cfd21c1d2 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -1308,6 +1308,27 @@ hipError_t hipFreeArray(hipArray* array); */ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind); +/** + * @brief Copies data between host and device. 
+ * + * @param[in] dst Destination memory address + * @param[in] dpitch Pitch of destination memory + * @param[in] src Source memory address + * @param[in] spitch Pitch of source memory + * @param[in] width Width of matrix transfer (columns in bytes) + * @param[in] height Height of matrix transfer (rows) + * @param[in] kind Type of transfer + * @param[in] stream Stream to use + * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorInvalidPitchValue, #hipErrorInvalidDevicePointer, #hipErrorInvalidMemcpyDirection + * + * @see hipMemcpy, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpyToSymbol, hipMemcpyAsync + */ +#if __cplusplus +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream = 0); +#else +hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream); +#endif + /** * @brief Copies data between host and device. 
* diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 821f64bc76..c4bc7db096 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -793,6 +793,24 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, return ihipLogStatus(e); } +hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, + size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + if(width > dpitch || width > spitch) + return ihipLogStatus(hipErrorUnknown); + hipError_t e = hipSuccess; + try { + for(int i = 0; i < height; ++i) { + e = hip_internal::memcpyAsync((unsigned char*)dst + i*dpitch, (unsigned char*)src + i*spitch, width, kind,stream); + } + } + catch (ihipException ex) { + e = ex._code; + } + + return ihipLogStatus(e); +} + hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { From 3ca552c6696499b91aad0a01be28d2ebcba82b6b Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 21:59:48 +0300 Subject: [PATCH 063/171] [HIPIFY] HIPIFY and HIP sync with CUDA Driver API data types. + Update CUDA_Driver_API_functions_supported_by_HIP.md. + Final update of HIPIFY with CUDA driver data types. [TODO] Syncing HIPIFY and HIP by CUDA Driver API functions. 
[ROCm/clr commit: 3b407762442b785b9211a43cb1e02a15f7170ae2] --- ...A_Driver_API_functions_supported_by_HIP.md | 62 ++++ .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 324 +++++++++++------- 2 files changed, 253 insertions(+), 133 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 3434d29a70..ad9d791a6d 100644 --- a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -323,6 +323,68 @@ | 500 |*`CUDA_ERROR_NOT_FOUND`* |*`hipErrorNotFound`* | This indicates that a named symbol was not found. Examples of symbols are global/constant variable names, texture names, and surface names. | | 600 |*`CUDA_ERROR_NOT_READY`* |*`hipErrorNotReady`* | This indicates that asynchronous operations issued previously have not completed yet. This result is not actually an error, but must be indicated differently than CUDA_SUCCESS (which indicates completion). Calls that may return this value include cuEventQuery() and cuStreamQuery(). | | 700 |*`CUDA_ERROR_ILLEGAL_ADDRESS`* |*`hipErrorIllegalAddress`* | While executing a kernel, the device encountered a load or store instruction on an invalid memory address. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 701 |*`CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES`* |*`hipErrorLaunchOutOfResources`* | This indicates that a launch did not occur because it did not have appropriate resources. This error usually indicates that the user has attempted to pass too many arguments to the device kernel, or the kernel launch specifies too many threads for the kernel's register count. Passing arguments of the wrong size (i.e. 
a 64-bit pointer when a 32-bit int is expected) is equivalent to passing too many arguments and can also result in this error. | +| 702 |*`CUDA_ERROR_LAUNCH_TIMEOUT`* |*`hipErrorLaunchTimeOut`* | This indicates that the device kernel took too long to execute. This can only occur if timeouts are enabled - see the device attribute CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT for more information. The context cannot be used (and must be destroyed similar to CUDA_ERROR_LAUNCH_FAILED). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 703 |*`CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING`* | | This error indicates a kernel launch that uses an incompatible texturing mode. | +| 704 |*`CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED`* |*`hipErrorPeerAccessAlreadyEnabled`* | This error indicates that a call to cuCtxEnablePeerAccess() is trying to re-enable peer access to a context which has already had peer access to it enabled. | +| 705 |*`CUDA_ERROR_PEER_ACCESS_NOT_ENABLED`* |*`hipErrorPeerAccessNotEnabled`* | This error indicates that cuCtxDisablePeerAccess() is trying to disable peer access which has not been enabled yet via cuCtxEnablePeerAccess(). | +| 708 |*`CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE`* | | This error indicates that the primary context for the specified device has already been initialized. | +| 709 |*`CUDA_ERROR_CONTEXT_IS_DESTROYED`* | | This error indicates that the context current to the calling thread has been destroyed using cuCtxDestroy, or is a primary context which has not yet been initialized. | +| 710 |*`CUDA_ERROR_ASSERT`* | | A device-side assert triggered during kernel execution. The context cannot be used anymore, and must be destroyed. All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. 
| +| 711 |*`CUDA_ERROR_TOO_MANY_PEERS`* | | This error indicates that the hardware resources required to enable peer access have been exhausted for one or more of the devices passed to cuCtxEnablePeerAccess(). | +| 712 |*`CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED`* |*`hipErrorHostMemoryAlreadyRegistered`* | This error indicates that the memory range passed to cuMemHostRegister() has already been registered. | +| 713 |*`CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED`* |*`hipErrorHostMemoryNotRegistered`* | This error indicates that the pointer passed to cuMemHostUnregister() does not correspond to any currently registered memory region. | +| 714 |*`CUDA_ERROR_HARDWARE_STACK_ERROR`* | | While executing a kernel, the device encountered a stack error. This can be due to stack corruption or exceeding the stack size limit. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 715 |*`CUDA_ERROR_ILLEGAL_INSTRUCTION`* | | While executing a kernel, the device encountered an illegal instruction. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 716 |*`CUDA_ERROR_MISALIGNED_ADDRESS`* | | While executing a kernel, the device encountered a load or store instruction on a memory address which is not aligned. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. 
| +| 717 |*`CUDA_ERROR_INVALID_ADDRESS_SPACE`* | | While executing a kernel, the device encountered an instruction which can only operate on memory locations in certain address spaces (global, shared, or local), but was supplied a memory address not belonging to an allowed address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 718 |*`CUDA_ERROR_INVALID_PC`* | | While executing a kernel, the device program counter wrapped its address space. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 719 |*`CUDA_ERROR_LAUNCH_FAILED`* | | An exception occurred on the device while executing a kernel. Common causes include dereferencing an invalid device pointer and accessing out of bounds shared memory. The context cannot be used, so it must be destroyed (and a new one should be created). All existing device memory allocations from this context are invalid and must be reconstructed if the program is to continue using CUDA. | +| 800 |*`CUDA_ERROR_NOT_PERMITTED`* | | This error indicates that the attempted operation is not permitted. | +| 801 |*`CUDA_ERROR_NOT_SUPPORTED`* | | This error indicates that the attempted operation is not supported on the current system or device. | +| 999 |*`CUDA_ERROR_UNKNOWN`* | | This indicates that an unknown internal error has occurred. 
| +| enum |***`CUstream_flags`*** |***`hipStreamFlags`*** | Stream creation flags | +| 0x0 |*`CU_STREAM_DEFAULT`* |*`hipStreamDefault`* | Default stream flag | +| 0x1 |*`CU_STREAM_NON_BLOCKING`* |*`hipStreamNonBlocking`* | Stream does not synchronize with stream 0 (the NULL stream) | +| typedef | `CUarray` | `hipArray *` | CUDA array | +| struct | `CUarray_st` | `hipArray` | CUDA array | +| typedef | `CUcontext` | `hipCtx_t` | CUDA context | +| typedef | `CUdevice` | `hipDevice_t` | CUDA device | +| typedef | `CUdeviceptr` | `hipDeviceptr_t` | CUDA device pointer. CUdeviceptr is defined as an unsigned integer type whose size matches the size of a pointer on the target platform. | +| typedef | `CUevent` | `hipEvent_t` | CUDA event | +| typedef | `CUfunction` | `hipFunction_t` | CUDA function | +| typedef | `CUgraphicsResource` | | CUDA graphics interop resource | +| typedef | `CUmipmappedArray` | | CUDA mipmapped array | +| typedef | `CUmodule` | `hipModule_t` | CUDA module | +| typedef | `CUstream` | `hipStream_t` | CUDA stream | +| typedef | `CUstreamCallback` | `hipStreamCallback_t` | CUDA stream callback | +| typedef | `CUsurfObject` | | An opaque value that represents a CUDA surface object | +| typedef | `CUsurfref` | | CUDA surface reference | +| typedef | `CUtexObject` | | An opaque value that represents a CUDA texture object | +| typedef | `CUtexref` | | CUDA texture reference | +| define |`CU_IPC_HANDLE_SIZE` | | CUDA IPC handle size. | +| define |`CU_LAUNCH_PARAM_BUFFER_POINTER` | `HIP_LAUNCH_PARAM_BUFFER_POINTER` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a buffer containing all kernel parameters used for launching kernel f. This buffer needs to honor all alignment/padding requirements of the individual parameters. If CU_LAUNCH_PARAM_BUFFER_SIZE is not also specified in the extra array, then CU_LAUNCH_PARAM_BUFFER_POINTER will have no effect. 
| +| define |`CU_LAUNCH_PARAM_BUFFER_SIZE` | `HIP_LAUNCH_PARAM_BUFFER_SIZE` | Indicator that the next value in the extra parameter to cuLaunchKernel will be a pointer to a size_t which contains the size of the buffer specified with CU_LAUNCH_PARAM_BUFFER_POINTER. It is required that CU_LAUNCH_PARAM_BUFFER_POINTER also be specified in the extra array if the value associated with CU_LAUNCH_PARAM_BUFFER_SIZE is not zero. | +| define |`CU_LAUNCH_PARAM_END` | `HIP_LAUNCH_PARAM_END` | End of array terminator for the extra parameter to cuLaunchKernel. | +| define |`CU_MEMHOSTALLOC_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTALLOC_WRITECOMBINED` | | If set, host memory is allocated as write-combined - fast to write, faster to DMA, slow to read except via SSE4 streaming load instruction (MOVNTDQA). Flag for cuMemHostAlloc(). | +| define |`CU_MEMHOSTREGISTER_DEVICEMAP` | | If set, host memory is mapped into CUDA address space and cuMemHostGetDevicePointer() may be called on the host pointer. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_IOMEMORY` | | If set, the passed memory pointer is treated as pointing to some memory-mapped I/O space, e.g. belonging to a third-party PCIe device. On Windows the flag is a no-op. On Linux that memory is marked as non cache-coherent for the GPU and is expected to be physically contiguous. It may return CUDA_ERROR_NOT_PERMITTED if run as an unprivileged user, CUDA_ERROR_NOT_SUPPORTED on older Linux kernel versions. On all other platforms, it is not supported and CUDA_ERROR_NOT_SUPPORTED is returned. Flag for cuMemHostRegister(). | +| define |`CU_MEMHOSTREGISTER_PORTABLE` | | If set, host memory is portable between CUDA contexts. Flag for cuMemHostRegister(). 
| +| define |`CU_PARAM_TR_DEFAULT` | | For texture references loaded into the module, use default texunit from texture reference. | +| define |`CU_STREAM_LEGACY` | | Legacy stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with legacy synchronization behavior. See details of the synchronization behavior. | +| define |`CU_STREAM_PER_THREAD` | | Per-thread stream handle. Stream handle that can be passed as a CUstream to use an implicit stream with per-thread synchronization behavior. See details of the synchronization behavior. | +| define |`CU_TRSA_OVERRIDE_FORMAT` | | Override the texref format with a format inferred from the array. Flag for cuTexRefSetArray(). | +| define |`CU_TRSF_NORMALIZED_COORDINATES` | | Use normalized texture coordinates in the range [0,1) instead of [0,dim). Flag for cuTexRefSetFlags(). | +| define |`CU_TRSF_SRGB` | | Perform sRGB->linear conversion during texture read. Flag for cuTexRefSetFlags(). | +| define |`CUDA_ARRAY3D_2DARRAY` | | Deprecated, use CUDA_ARRAY3D_LAYERED. | +| define |`CUDA_ARRAY3D_CUBEMAP` | | If set, the CUDA array is a collection of six 2D arrays, representing faces of a cube. The width of such a CUDA array must be equal to its height, and Depth must be six. If CUDA_ARRAY3D_LAYERED flag is also set, then the CUDA array is a collection of cubemaps and Depth must be a multiple of six. | +| define |`CUDA_ARRAY3D_DEPTH_TEXTURE` | | This flag, if set, indicates that the CUDA array is a DEPTH_TEXTURE. | +| define |`CUDA_ARRAY3D_LAYERED` | | If set, the CUDA array is a collection of layers, where each layer is either a 1D or a 2D array and the Depth member of CUDA_ARRAY3D_DESCRIPTOR specifies the number of layers, not the depth of a 3D array. | +| define |`CUDA_ARRAY3D_SURFACE_LDST` | | This flag must be set in order to bind a surface reference to the CUDA array.
| +| define |`CUDA_ARRAY3D_TEXTURE_GATHER` | | This flag must be set in order to perform texture gather operations on a CUDA array. | +| define |`CUDA_VERSION` | | CUDA API version number. | ## **2. Error Handling** diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 390b4ee88c..b3b2b993e3 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -208,59 +208,55 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 + cuda2hipRename["CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"] = {"hipErrorLaunchIncompatibleTexturing", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 703 + cuda2hipRename["CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"] = {"hipErrorPrimaryContextActive", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 708 + cuda2hipRename["CUDA_ERROR_CONTEXT_IS_DESTROYED"] = {"hipErrorContextIsDestroyed", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 709 + cuda2hipRename["CUDA_ERROR_NOT_PERMITTED"] = {"hipErrorNotPermitted", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 800 + cuda2hipRename["CUDA_ERROR_NOT_SUPPORTED"] = {"hipErrorNotSupported", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 801 // CUDA RT API error code only - cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 - cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, 
API_RUNTIME, HIP_UNSUPPORTED}; // 9 - cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 - cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 - cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 - cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 - cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 - cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 - cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 - cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 - cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 - cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 - cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 - cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 - cuda2hipRename["cudaErrorInvalidFilterSetting"] = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 - cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 - cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 + 
cuda2hipRename["cudaErrorMissingConfiguration"] = {"hipErrorMissingConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["cudaErrorPriorLaunchFailure"] = {"hipErrorPriorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 5 + cuda2hipRename["cudaErrorInvalidDeviceFunction"] = {"hipErrorInvalidDeviceFunction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["cudaErrorInvalidConfiguration"] = {"hipErrorInvalidConfiguration", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 9 + cuda2hipRename["cudaErrorInvalidPitchValue"] = {"hipErrorInvalidPitchValue", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["cudaErrorInvalidSymbol"] = {"hipErrorInvalidSymbol", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 13 + cuda2hipRename["cudaErrorInvalidHostPointer"] = {"hipErrorInvalidHostPointer", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 16 + cuda2hipRename["cudaErrorInvalidDevicePointer"] = {"hipErrorInvalidDevicePointer", CONV_ERR, API_RUNTIME}; // 17 + cuda2hipRename["cudaErrorInvalidTexture"] = {"hipErrorInvalidTexture", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 18 + cuda2hipRename["cudaErrorInvalidTextureBinding"] = {"hipErrorInvalidTextureBinding", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 19 + cuda2hipRename["cudaErrorInvalidChannelDescriptor"] = {"hipErrorInvalidChannelDescriptor", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 20 + cuda2hipRename["cudaErrorInvalidMemcpyDirection"] = {"hipErrorInvalidMemcpyDirection", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 21 + cuda2hipRename["cudaErrorAddressOfConstant"] = {"hipErrorAddressOfConstant", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 22 + cuda2hipRename["cudaErrorTextureFetchFailed"] = {"hipErrorTextureFetchFailed", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 23 + cuda2hipRename["cudaErrorTextureNotBound"] = {"hipErrorTextureNotBound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 24 + cuda2hipRename["cudaErrorSynchronizationError"] = {"hipErrorSynchronizationError", 
CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 25 + cuda2hipRename["cudaErrorInvalidFilterSetting"] = {"hipErrorInvalidFilterSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 26 + cuda2hipRename["cudaErrorInvalidNormSetting"] = {"hipErrorInvalidNormSetting", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 27 + cuda2hipRename["cudaErrorMixedDeviceExecution"] = {"hipErrorMixedDeviceExecution", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 28 // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 + cuda2hipRename["cudaErrorNotYetImplemented"] = {"hipErrorNotYetImplemented", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 31 // Deprecated as of CUDA 3.1 - cuda2hipRename["cudaErrorMemoryValueTooLarge"] = {"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 - cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 - cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 - cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 - cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 - cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 - cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 - cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 - cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 - cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = 
{"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 - cuda2hipRename["cudaErrorAssert"] = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 - cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 - cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 - cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 - cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 - cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 - cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 - cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 70 - cuda2hipRename["cudaErrorNotSupported"] = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 - cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 - cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 - cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 - cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 - cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 - cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f + cuda2hipRename["cudaErrorMemoryValueTooLarge"] = 
{"hipErrorMemoryValueTooLarge", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 32 + cuda2hipRename["cudaErrorInsufficientDriver"] = {"hipErrorInsufficientDriver", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 35 + cuda2hipRename["cudaErrorSetOnActiveProcess"] = {"hipErrorSetOnActiveProcess", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 36 + cuda2hipRename["cudaErrorInvalidSurface"] = {"hipErrorInvalidSurface", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 37 + cuda2hipRename["cudaErrorDuplicateVariableName"] = {"hipErrorDuplicateVariableName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 43 + cuda2hipRename["cudaErrorDuplicateTextureName"] = {"hipErrorDuplicateTextureName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 44 + cuda2hipRename["cudaErrorDuplicateSurfaceName"] = {"hipErrorDuplicateSurfaceName", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 45 + cuda2hipRename["cudaErrorDevicesUnavailable"] = {"hipErrorDevicesUnavailable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 46 + cuda2hipRename["cudaErrorIncompatibleDriverContext"] = {"hipErrorIncompatibleDriverContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 49 + cuda2hipRename["cudaErrorDeviceAlreadyInUse"] = {"hipErrorDeviceAlreadyInUse", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 54 + cuda2hipRename["cudaErrorLaunchMaxDepthExceeded"] = {"hipErrorLaunchMaxDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 65 + cuda2hipRename["cudaErrorLaunchFileScopedTex"] = {"hipErrorLaunchFileScopedTex", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 66 + cuda2hipRename["cudaErrorLaunchFileScopedSurf"] = {"hipErrorLaunchFileScopedSurf", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 67 + cuda2hipRename["cudaErrorSyncDepthExceeded"] = {"hipErrorSyncDepthExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 68 + cuda2hipRename["cudaErrorLaunchPendingCountExceeded"] = {"hipErrorLaunchPendingCountExceeded", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 69 + cuda2hipRename["cudaErrorNotPermitted"] = {"hipErrorNotPermitted", CONV_ERR, 
API_RUNTIME, HIP_UNSUPPORTED}; // 70 + cuda2hipRename["cudaErrorNotSupported"] = {"hipErrorNotSupported", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 71 + cuda2hipRename["cudaErrorStartupFailure"] = {"hipErrorStartupFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 0x7f // Deprecated as of CUDA 4.1 - cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 - - + cuda2hipRename["cudaErrorApiFailureBase"] = {"hipErrorApiFailureBase", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 10000 cuda2hipRename["CUDA_SUCCESS"] = {"hipSuccess", CONV_ERR, API_DRIVER}; // 0 cuda2hipRename["cudaSuccess"] = {"hipSuccess", CONV_ERR, API_RUNTIME}; // 0 @@ -346,34 +342,50 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_ILLEGAL_ADDRESS"] = {"hipErrorIllegalAddress", CONV_ERR, API_DRIVER}; // 700 cuda2hipRename["cudaErrorIllegalAddress"] = {"hipErrorIllegalAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 77 - ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; // 701 + cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER}; // 719 + cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; // 702 + cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 + + cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; // 704 + cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 + + 
cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; // 705 + cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 + + cuda2hipRename["CUDA_ERROR_ASSERT"] = {"hipErrorAssert", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 710 + cuda2hipRename["cudaErrorAssert"] = {"hipErrorAssert", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 59 + + cuda2hipRename["CUDA_ERROR_TOO_MANY_PEERS"] = {"hipErrorTooManyPeers", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 711 + cuda2hipRename["cudaErrorTooManyPeers"] = {"hipErrorTooManyPeers", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 60 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; // 712 + cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 + + cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; // 713 + cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + + cuda2hipRename["CUDA_ERROR_HARDWARE_STACK_ERROR"] = {"hipErrorHardwareStackError", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 714 + cuda2hipRename["cudaErrorHardwareStackError"] = {"hipErrorHardwareStackError", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 72 + + cuda2hipRename["CUDA_ERROR_ILLEGAL_INSTRUCTION"] = {"hipErrorIllegalInstruction", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 715 + cuda2hipRename["cudaErrorIllegalInstruction"] = {"hipErrorIllegalInstruction", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 73 + + cuda2hipRename["CUDA_ERROR_MISALIGNED_ADDRESS"] = {"hipErrorMisalignedAddress", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 716 + cuda2hipRename["cudaErrorMisalignedAddress"] = {"hipErrorMisalignedAddress", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 74 + + 
cuda2hipRename["CUDA_ERROR_INVALID_ADDRESS_SPACE"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 717 + cuda2hipRename["cudaErrorInvalidAddressSpace"] = {"hipErrorInvalidAddressSpace", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 75 + + cuda2hipRename["CUDA_ERROR_INVALID_PC"] = {"hipErrorInvalidPc", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 718 + cuda2hipRename["cudaErrorInvalidPc"] = {"hipErrorInvalidPc", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 76 + + cuda2hipRename["CUDA_ERROR_LAUNCH_FAILED"] = {"hipErrorLaunchFailure", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 719 cuda2hipRename["cudaErrorLaunchFailure"] = {"hipErrorLaunchFailure", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 4 - cuda2hipRename["CUDA_ERROR_LAUNCH_TIMEOUT"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchTimeout"] = {"hipErrorLaunchTimeOut", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 6 - - cuda2hipRename["CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorLaunchOutOfResources"] = {"hipErrorLaunchOutOfResources", CONV_ERR, API_RUNTIME}; // 7 - - cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 - -// cuda2hipRename["CUDA_ERROR_NOT_INITIALIZED"] = {"hipErrorInitializationError", CONV_ERR, API_DRIVER}; -// cuda2hipRename["cudaErrorInitializationError"] = {"hipErrorInitializationError", CONV_ERR, API_RUNTIME}; - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessAlreadyEnabled"] = {"hipErrorPeerAccessAlreadyEnabled", CONV_ERR, API_RUNTIME}; // 50 - - cuda2hipRename["CUDA_ERROR_PEER_ACCESS_NOT_ENABLED"] = {"hipErrorPeerAccessNotEnabled", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorPeerAccessNotEnabled"] = 
{"hipErrorPeerAccessNotEnabled", CONV_ERR, API_RUNTIME}; // 51 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryAlreadyRegistered"] = {"hipErrorHostMemoryAlreadyRegistered", CONV_ERR, API_RUNTIME}; // 61 - - cuda2hipRename["CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_DRIVER}; - cuda2hipRename["cudaErrorHostMemoryNotRegistered"] = {"hipErrorHostMemoryNotRegistered", CONV_ERR, API_RUNTIME}; // 62 + cuda2hipRename["CUDA_ERROR_UNKNOWN"] = {"hipErrorUnknown", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 999 + cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 ///////////////////////////// CUDA DRIVER API ///////////////////////////// // enums @@ -389,59 +401,86 @@ struct cuda2hipMap { cuda2hipRename["CUipcEventHandle"] = {"hipIpcEventHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUipcMemHandle"] = {"hipIpcMemHandle", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["CUaddress_mode"] = {"hipAddress_mode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = {"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_TR_ADDRESS_MODE_WRAP"] = {"HIP_TR_ADDRESS_MODE_WRAP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0 + cuda2hipRename["CU_TR_ADDRESS_MODE_CLAMP"] = {"HIP_TR_ADDRESS_MODE_CLAMP", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_TR_ADDRESS_MODE_MIRROR"] = 
{"HIP_TR_ADDRESS_MODE_MIRROR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_TR_ADDRESS_MODE_BORDER"] = {"HIP_TR_ADDRESS_MODE_BORDER", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 cuda2hipRename["CUarray_cubemap_face"] = {"hipArray_cubemap_face", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_X"] = {"HIP_CUBEMAP_FACE_POSITIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_X"] = {"HIP_CUBEMAP_FACE_NEGATIVE_X", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Y"] = {"HIP_CUBEMAP_FACE_POSITIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Y"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Y", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_CUBEMAP_FACE_POSITIVE_Z"] = {"HIP_CUBEMAP_FACE_POSITIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CUBEMAP_FACE_NEGATIVE_Z"] = {"HIP_CUBEMAP_FACE_NEGATIVE_Z", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x05 cuda2hipRename["CUarray_format"] = {"hipArray_format", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - 
cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 - cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a - cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x20 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT8"] = {"HIP_AD_FORMAT_UNSIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT16"] = {"HIP_AD_FORMAT_UNSIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_AD_FORMAT_UNSIGNED_INT32"] = {"HIP_AD_FORMAT_UNSIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT8"] = {"HIP_AD_FORMAT_SIGNED_INT8", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT16"] = {"HIP_AD_FORMAT_SIGNED_INT16", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x09 + cuda2hipRename["CU_AD_FORMAT_SIGNED_INT32"] = {"HIP_AD_FORMAT_SIGNED_INT32", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x0a + cuda2hipRename["CU_AD_FORMAT_HALF"] = {"HIP_AD_FORMAT_HALF", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_AD_FORMAT_FLOAT"] = {"HIP_AD_FORMAT_FLOAT", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 
0x20 // Compute mode - cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) - cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CUcomputemode"] = {"hipComputemode", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) + cuda2hipRename["CU_COMPUTEMODE_DEFAULT"] = {"hipComputeModeDefault", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0 // API_RUNTIME ANALOGUE (cudaComputeModeDefault = 0) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) + cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) + cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - 
cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; - cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; - 
cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) + cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_SIZE"] = {"HIP_LAUNCH_PARAM_BUFFER_SIZE", CONV_DEV, API_DRIVER}; // ((void*)0x02) + cuda2hipRename["CU_LAUNCH_PARAM_END"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER}; // ((void*)0x00) + cuda2hipRename["CU_IPC_HANDLE_SIZE"] = {"HIP_LAUNCH_PARAM_END", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 + cuda2hipRename["CU_MEMHOSTALLOC_DEVICEMAP"] = {"HIP_MEMHOSTALLOC_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTALLOC_PORTABLE"] = {"HIP_MEMHOSTALLOC_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_MEMHOSTALLOC_WRITECOMBINED"] = {"HIP_MEMHOSTALLOC_WRITECOMBINED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_DEVICEMAP"] = {"HIP_MEMHOSTREGISTER_DEVICEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_MEMHOSTREGISTER_IOMEMORY"] = {"HIP_MEMHOSTREGISTER_IOMEMORY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_MEMHOSTREGISTER_PORTABLE"] = {"HIP_MEMHOSTREGISTER_PORTABLE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_PARAM_TR_DEFAULT"] = {"HIP_PARAM_TR_DEFAULT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // -1 + cuda2hipRename["CU_STREAM_LEGACY"] = {"HIP_STREAM_LEGACY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x1) + cuda2hipRename["CU_STREAM_PER_THREAD"] = {"HIP_STREAM_PER_THREAD", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // ((CUstream)0x2) + cuda2hipRename["CU_TRSA_OVERRIDE_FORMAT"] = {"HIP_TRSA_OVERRIDE_FORMAT", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_NORMALIZED_COORDINATES"] = {"HIP_TRSF_NORMALIZED_COORDINATES", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};// 0x02 + 
cuda2hipRename["CU_TRSF_READ_AS_INTEGER"] = {"HIP_TRSF_READ_AS_INTEGER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_TRSF_SRGB"] = {"HIP_TRSF_SRGB", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + // Deprecated, use CUDA_ARRAY3D_LAYERED + cuda2hipRename["CUDA_ARRAY3D_2DARRAY"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_CUBEMAP"] = {"HIP_ARRAY3D_CUBEMAP", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CUDA_ARRAY3D_DEPTH_TEXTURE"] = {"HIP_ARRAY3D_DEPTH_TEXTURE", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CUDA_ARRAY3D_LAYERED"] = {"HIP_ARRAY3D_LAYERED", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CUDA_ARRAY3D_SURFACE_LDST"] = {"HIP_ARRAY3D_SURFACE_LDST", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CUDA_ARRAY3D_TEXTURE_GATHER"] = {"HIP_ARRAY3D_TEXTURE_GATHER", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CUDA_VERSION"] = {"HIP_VERSION", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7050 // Types // NOTE: CUdevice might be changed to typedef int in the future. 
cuda2hipRename["CUdevice"] = {"hipDevice_t", CONV_TYPE, API_DRIVER}; - cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) - cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute_enum"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdevice_attribute"] = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaDeviceAttr) + cuda2hipRename["CUdeviceptr"] = {"hipDeviceptr_t", CONV_TYPE, API_DRIVER}; + // CUDA: "The types::CUarray and struct ::cudaArray * represent the same data type and may be used interchangeably by casting the two types between each other." + // typedef struct cudaArray *cudaArray_t; + // typedef struct CUarray_st *CUarray; + cuda2hipRename["CUarray_st"] = {"hipArray", CONV_MEM, API_RUNTIME}; // API_Runtime ANALOGUE (cudaArray) + cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) + // unsupported yet by HIP cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) @@ -554,8 +593,8 @@ struct cuda2hipMap { // TODO: Analogues enum is needed in HIP. Couldn't map enum to struct hipPointerAttribute_t. // TODO: Do for Pointer Attributes the same as for Device Attributes. 
- // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute_enum"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) + // cuda2hipRename["CUpointer_attribute"] = {"hipPointerAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_CONTEXT"] = {"hipPointerAttributeContext", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_MEMORY_TYPE"] = {"hipPointerAttributeMemoryType", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_DEVICE_POINTER"] = {"hipPointerAttributeDevicePointer", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (no) @@ -565,13 +604,17 @@ struct cuda2hipMap { cuda2hipRename["CU_POINTER_ATTRIBUTE_BUFFER_ID"] = {"hipPointerAttributeBufferId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (no) cuda2hipRename["CU_POINTER_ATTRIBUTE_IS_MANAGED"] = {"hipPointerAttributeIsManaged", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (no) - // pointer to CUfunc_st cuda2hipRename["CUfunction"] = {"hipFunction_t", CONV_TYPE, API_DRIVER}; - // TODO: in HIP ihipModuleSymbol_t should be declared in hip_runtime_api.h, not in hcc_detail/hip_runtime_api.h, as it's analogue CUfunc_st is declared also in cuda.h - // ToDO: examples are needed with CUfunc_st + // TODO: move "typedef struct ihipModuleSymbol_t *hipFunction_t;" from hcc_details to HIP + // typedef struct CUfunc_st *CUfunction; // cuda2hipRename["CUfunc_st"] = {"ihipModuleSymbol_t", CONV_TYPE, API_DRIVER}; + // typedef struct CUgraphicsResource_st 
*CUgraphicsResource; + cuda2hipRename["CUgraphicsResource"] = {"hipGraphicsResource_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUmipmappedArray_st *CUmipmappedArray; + cuda2hipRename["CUmipmappedArray"] = {"hipMipmappedArray_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP cuda2hipRename["CUfunction_attribute"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUfunction_attribute_enum"] = {"hipFuncAttribute_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -607,8 +650,6 @@ struct cuda2hipMap { cuda2hipRename["CU_OCCUPANCY_DEFAULT"] = {"hipOccupancyDefault", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 // API_Runtime ANALOGUE (cudaOccupancyDefault = 0x0) cuda2hipRename["CU_OCCUPANCY_DISABLE_CACHING_OVERRIDE"] = {"hipOccupancyDisableCachingOverride", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaOccupancyDisableCachingOverride = 0x1) - - cuda2hipRename["CUfunc_cache_enum"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) cuda2hipRename["CUfunc_cache"] = {"hipFuncCache", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaFuncCache) cuda2hipRename["CU_FUNC_CACHE_PREFER_NONE"] = {"hipFuncCachePreferNone", CONV_CACHE, API_DRIVER}; // 0x00 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) @@ -752,24 +793,41 @@ struct cuda2hipMap { cuda2hipRename["CU_RES_VIEW_FORMAT_SIGNED_BC6H"] = {"hipResViewFormatSignedBlockCompressed6H", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x21 // API_Runtime ANALOGUE (cudaResViewFormatSignedBlockCompressed6H = 0x21) cuda2hipRename["CU_RES_VIEW_FORMAT_UNSIGNED_BC7"] = {"hipResViewFormatUnsignedBlockCompressed7", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 0x22 // API_Runtime ANALOGUE (cudaResViewFormatUnsignedBlockCompressed7 = 0x22) - - - cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUsharedconfig"] = {"hipSharedMemConfig", CONV_TYPE, 
API_DRIVER}; + cuda2hipRename["CUsharedconfig_enum"] = {"hipSharedMemConfig", CONV_TYPE, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_DEFAULT_BANK_SIZE"] = {"hipSharedMemBankSizeDefault", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_FOUR_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeFourByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CU_SHARED_MEM_CONFIG_EIGHT_BYTE_BANK_SIZE"] = {"hipSharedMemBankSizeEightByte", CONV_DEV, API_DRIVER}; cuda2hipRename["CUcontext"] = {"hipCtx_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUctx_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipCtx_t *hipCtx_t;" from hcc_details to HIP + // typedef struct CUctx_st *CUcontext; + // cuda2hipRename["CUctx_st"] = {"ihipCtx_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUmodule"] = {"hipModule_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUmod_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; + // TODO: move "typedef struct ihipModule_t *hipModule_t;" from hcc_details to HIP + // typedef struct CUmod_st *CUmodule; + // cuda2hipRename["CUmod_st"] = {"ihipModule_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUstream"] = {"hipStream_t", CONV_TYPE, API_DRIVER}; - // TODO: - // cuda2hipRename["CUstream_st"] = {"XXXX", CONV_TYPE, API_DRIVER}; - // Stream Flags + // TODO: move "typedef struct ihipStream_t *hipStream_t;" from hcc_details to HIP + // typedef struct CUstream_st *CUstream; + // cuda2hipRename["CUstream_st"] = {"ihipStream_t", CONV_TYPE, API_DRIVER}; + + // typedef void (*hipStreamCallback_t) (hipStream_t stream, hipError_t status, void* userData); + // typedef void (CUDA_CB *CUstreamCallback) (CUstream hStream, CUresult status, void* userData) + cuda2hipRename["CUstreamCallback"] = {"hipStreamCallback_t", CONV_TYPE, API_DRIVER}; + + cuda2hipRename["CUsurfObject"] = {"hipSurfaceObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUsurfref_st *CUsurfref; + cuda2hipRename["CUsurfref"] = {"hipSurfaceReference_t", 
CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUsurfref_st"] = {"ihipSurfaceReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUtexObject"] = {"hipTextureObject", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // typedef struct CUtexref_st *CUtexref; + cuda2hipRename["CUtexref"] = {"hipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUtexref_st"] = {"ihipTextureReference_t", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + + // Stream Flags enum + cuda2hipRename["CUstream_flags"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; + // cuda2hipRename["CUstream_flags_enum"] = {"hipStreamFlags", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; @@ -1254,10 +1312,10 @@ struct cuda2hipMap { // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) - cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 - cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) + cuda2hipRename["cudaComputeModeExclusive"] = {"hipComputeModeExclusive", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE = 1) + cuda2hipRename["cudaComputeModeProhibited"] = {"hipComputeModeProhibited", CONV_DEV, 
API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_PROHIBITED = 2) + cuda2hipRename["cudaComputeModeExclusiveProcess"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_EXCLUSIVE_PROCESS = 3) // Device Flags // unsupported yet by HIP From df05e9676029da1c0b268488587886c8d5b06987 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 22:45:56 +0300 Subject: [PATCH 064/171] [HIPIFY] Blas update: add a few functions, supported by HIP. cublasDaxpy -> hipblasDaxpy cublasDgemv -> hipblasDgemv cublasDger -> hipblasDger cublasDgemm -> hipblasDgemm cublasDgemmBatched -> hipblasDgemmBatched cublasGetStream -> hipblasGetStream cublasSetStream -> hipblasSetStream cublasDaxpy -> hipblasDaxpy [ROCm/clr commit: 72df25a0c7dbab0271c29cf10551132916bfc9e9] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index b3b2b993e3..3b34f0c1c1 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -1601,7 +1601,7 @@ struct cuda2hipMap { // Blas types cuda2hipRename["cublasHandle_t"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // TODO: dereferencing: typedef struct cublasContext *cublasHandle_t; - cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; + // cuda2hipRename["cublasContext"] = {"hipblasHandle_t", CONV_TYPE, API_BLAS}; // Blas management functions // unsupported yet by hipblas/hcblas cuda2hipRename["cublasInit"] = {"hipblasInit", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1709,10 +1709,9 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; - // there is no such a function in CUDA cuda2hipRename["cublasSaxpyBatched"] = {"hipblasSaxpyBatched", 
CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCaxpy"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1789,8 +1788,8 @@ struct cuda2hipMap { cuda2hipRename["cublasSgemv"] = {"hipblasSgemv", CONV_MATH_FUNC, API_BLAS}; // there is no such a function in CUDA cuda2hipRename["cublasSgemvBatched"] = {"hipblasSgemvBatched", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemv"] = {"hipblasDgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgemv"] = {"hipblasCgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgemv"] = {"hipblasZgemv", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1868,8 +1867,8 @@ struct cuda2hipMap { // GER cuda2hipRename["cublasSger"] = {"hipblasSger", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDger"] = {"hipblasDger", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgeru"] = {"hipblasCgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasCgerc"] = {"hipblasCgerc", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZgeru"] = {"hipblasZgeru", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -1906,8 +1905,7 @@ struct cuda2hipMap { // Blas3 (v1) Routines // GEMM cuda2hipRename["cublasSgemm"] = {"hipblasSgemm", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemm"] = {"hipblasDgemm", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemm"] = 
{"hipblasDgemm", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemm"] = {"hipblasCgemm", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ -1915,8 +1913,7 @@ struct cuda2hipMap { // BATCH GEMM cuda2hipRename["cublasSgemmBatched"] = {"hipblasSgemmBatched", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDgemmBatched"] = {"hipblasDgemmBatched", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasCgemmBatched"] = {"hipblasCgemmBatched", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas @@ -2074,10 +2071,9 @@ struct cuda2hipMap { cuda2hipRename["cublasCreate_v2"] = {"hipblasCreate", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasDestroy_v2"] = {"hipblasDestroy", CONV_MATH_FUNC, API_BLAS}; - // unsupported yet by hipblas/hcblas cuda2hipRename["cublasGetVersion_v2"] = {"hipblasGetVersion", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; - cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasSetStream_v2"] = {"hipblasSetStream", CONV_MATH_FUNC, API_BLAS}; + cuda2hipRename["cublasGetStream_v2"] = {"hipblasGetStream", CONV_MATH_FUNC, API_BLAS}; cuda2hipRename["cublasGetPointerMode_v2"] = {"hipblasGetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasSetPointerMode_v2"] = {"hipblasSetPointerMode", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; @@ -2294,7 +2290,7 @@ struct cuda2hipMap { // AXPY cuda2hipRename["cublasSaxpy_v2"] = {"hipblasSaxpy", CONV_MATH_FUNC, API_BLAS}; // unsupported yet by hipblas/hcblas - cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; + cuda2hipRename["cublasDaxpy_v2"] = {"hipblasDaxpy", CONV_MATH_FUNC, API_BLAS}; 
cuda2hipRename["cublasCaxpy_v2"] = {"hipblasCaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; cuda2hipRename["cublasZaxpy_v2"] = {"hipblasZaxpy", CONV_MATH_FUNC, API_BLAS, HIP_UNSUPPORTED}; From e9ad0f3b00c498095b7de74c52d70ae63c780784 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 3 May 2017 23:05:44 +0300 Subject: [PATCH 065/171] [HIPIFY] CUDA RT memcpy functions update. cudaMemcpyFromSymbol -> hipMemcpyFromSymbol cudaMemcpyFromSymbolAsync -> hipMemcpyFromSymbolAsync cudaMemcpy2DAsync -> hipMemcpy2DAsync [ROCm/clr commit: 2636d91ba4ee46d54a70673a7cc1a7fa8f2dfc85] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 3b34f0c1c1..07fc817b53 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -1030,10 +1030,10 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyToSymbolAsync"] = {"hipMemcpyToSymbolAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyAsync"] = {"hipMemcpyAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2D"] = {"hipMemcpy2D", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpy2DToArray"] = {"hipMemcpy2DToArray", CONV_MEM, API_RUNTIME}; // unsupported yet by HIP cuda2hipRename["cudaMemcpy2DArrayToArray"] = {"hipMemcpy2DArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpy2DAsync"] = {"hipMemcpy2DAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArray"] = {"hipMemcpy2DFromArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DFromArrayAsync"] = {"hipMemcpy2DFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpy2DToArrayAsync"] = {"hipMemcpy2DToArrayAsync", CONV_MEM, API_RUNTIME, 
HIP_UNSUPPORTED}; @@ -1043,7 +1043,8 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyArrayToArray"] = {"hipMemcpyArrayToArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemcpyFromSymbol"] = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; From ba1a3360abe77b7146ab1a2813e9a7e169c56806 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 4 May 2017 06:47:55 +0530 Subject: [PATCH 066/171] hipMemcpy2DAsync for HIP/NVCC Change-Id: I46f0057fef49bdaaac41c1df80c3e27432b8f376 [ROCm/clr commit: f77059218c2f26ff2318e63e704fb38a5e823316] --- .../clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 4feefcc342..aad3ffcc44 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -345,7 +345,11 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolN } inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ - return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind))); + return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, 
hipMemcpyKindToCudaMemcpyKind(kind))); +} + +inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, hipStream_t stream) { + return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind),stream)); } inline static hipError_t hipMemcpy2DToArray(hipArray *dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind){ From 3028345953cbf88d7b658c2f0a2ca27a7b073236 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 5 May 2017 21:28:02 +0300 Subject: [PATCH 067/171] [HIPIFY] LLVM 3.9 support 3.8 and 3.9 are both supported. 3.8 is stable, 3.9 needs more testing. [ROCm/clr commit: 054bf0859db601ff17472c96bf8a820db0ac2f02] --- projects/clr/hipamd/hipify-clang/CMakeLists.txt | 9 +++++++-- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 4 ++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/CMakeLists.txt b/projects/clr/hipamd/hipify-clang/CMakeLists.txt index a02b91407f..872db3defe 100644 --- a/projects/clr/hipamd/hipify-clang/CMakeLists.txt +++ b/projects/clr/hipamd/hipify-clang/CMakeLists.txt @@ -6,8 +6,12 @@ set(BUILD_HIPIFY_CLANG 0 PARENT_SCOPE) # Find LLVM package find_package(LLVM 3.8 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) if (NOT ${LLVM_FOUND}) - message(STATUS "hipify-clang will not be built. To build it please specify absolute path to LLVM (v3.8) package using -DHIPIFY_CLANG_LLVM_DIR") -else() + find_package(LLVM 3.9 QUIET PATHS ${HIPIFY_CLANG_LLVM_DIR} NO_DEFAULT_PATH) + if (NOT ${LLVM_FOUND}) + message(STATUS "hipify-clang will not be built. 
To build it please specify absolute path to LLVM 3.8 or LLVM 3.9 package using -DHIPIFY_CLANG_LLVM_DIR") + endif() +endif() +if (${LLVM_FOUND}) list(APPEND CMAKE_MODULE_PATH ${LLVM_CMAKE_DIR}) include(AddLLVM) @@ -31,6 +35,7 @@ else() clangSerialization clangSema clangEdit + clangFormat clangLex clangAnalysis clangDriver diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 07fc817b53..1c6406b4ab 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -3533,7 +3533,11 @@ void printAllStats(const std::string &csvFile, int64_t totalFiles, int64_t conve int main(int argc, const char **argv) { auto start = std::chrono::steady_clock::now(); auto begin = start; +#if (LLVM_VERSION_MAJOR >= 3) && (LLVM_VERSION_MINOR >= 9) + llvm::sys::PrintStackTraceOnErrorSignal(StringRef()); +#else llvm::sys::PrintStackTraceOnErrorSignal(); +#endif CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore); std::vector fileSources = OptionsParser.getSourcePathList(); std::string dst = OutputFilename; From 618f4bf70275c5ba5e5323058030579c2a162776 Mon Sep 17 00:00:00 2001 From: wsttiger Date: Wed, 3 May 2017 14:21:08 +0000 Subject: [PATCH 068/171] Improve hipStreamWaitEvent test. - use addOne kernel, use local initializer rather than init_array. - use addOneReverse test to add from back of array. Test alternate fwd and backward to stress dependency logic. - check device-side dependencies. 
[ROCm/clr commit: a10d37e5e6993278b894d8c05103d3fbab055d37] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 154 ++++++++++++++++-- 1 file changed, 137 insertions(+), 17 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 637275c381..63c42da557 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -28,7 +28,41 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include +#include unsigned p_streams = 6; +unsigned p_db = 0; + + +template +__global__ void +addOne( const T *A_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (size_t i=offset; i +__global__ void +addOneReverse( const T *A_d, + T *C_d, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + (T)1; + //C_d[i] = (T)1; + } +} //------ @@ -36,49 +70,90 @@ unsigned p_streams = 6; template class Streamer { public: - Streamer(size_t numElements); + Streamer(T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsync(); + void runAsyncAfter(Streamer *depStreamer); + void runAsyncWaitSameStream(); void queryUntilComplete(); + void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + + hipEvent_t event() { return _event; }; + + T *C_d() { return _C_d; }; + private: - T *_A_h; - T *_B_h; T *_C_h; T *_A_d; - T *_B_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; size_t _numElements; + bool _reverse; }; + template -Streamer::Streamer(size_t numElements) : - _numElements(numElements) +Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : + _A_d(A_d), + 
_numElements(numElements), + _reverse(reverse) { - HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + size_t sizeElements = numElements * sizeof(int); + + HIPCHECK(hipMalloc(&_C_d, sizeElements)); + HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); + + HIPCHECK(hipMemset(_C_d, -1, sizeElements)); + HIPCHECK(hipMemset(_C_h, -2, sizeElements)); HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); }; + template -void Streamer::runAsync() +void Streamer::runAsyncAfter(Streamer *depStreamer) +{ + if (p_db) { + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + } + + if (depStreamer) { + HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); + } + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } + HIPCHECK(hipEventRecord(_event, _stream)); +} + + +template +void Streamer::runAsyncWaitSameStream() { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } else { + hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + } // Test case where hipStreamWaitEvent waits on same event we just placed into the queue. 
HIPCHECK(hipEventRecord(_event, _stream)); HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); } + template void Streamer::queryUntilComplete() { @@ -89,10 +164,26 @@ void Streamer::queryUntilComplete() e = hipStreamQuery(_stream); } while (e != hipSuccess) ; - printf ("completed after %d queries\n", numQueries); + printf ("info: hipStreamQuery completed after %d queries\n", numQueries); }; +template +void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +{ + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); + HIPCHECK(hipStreamSynchronize(_stream)); + + T expected = initValue + expectedOffset; + + for (size_t i=0; i<_numElements; i++) { + if (_C_h[i] != expected) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } +} + + //--- //Parse arguments specific to this test. @@ -122,39 +213,68 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + std::vector streamers; size_t numElements = N; + size_t sizeElements = numElements * sizeof(int); + + assert (sizeElements <= std::numeric_limits::max()); + + + int initValue = 1000; + + int * initArray_d, *initArray_h; + HIPCHECK(hipMalloc(&initArray_d, sizeElements)); + HIPCHECK(hipHostMalloc(&initArray_h, sizeElements)); + for (size_t i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); streamers.push_back(s); } if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsnc\n"); + printf ("==> Test 0x1 runAsyncAfter\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? streamers[i-1] : NULL); } HIPCHECK(hipDeviceSynchronize()); + + for (int i=0; isyncAndCheck(i+1, initValue, i+1); + } } if (p_tests & 0x2) { printf ("==> Test 0x2 queryUntilComplete\n"); for (int i=0; irunAsync(); + streamers[i]->runAsyncAfter(i ? 
streamers[i-1] : NULL); streamers[i]->queryUntilComplete(); } HIPCHECK(hipDeviceSynchronize()); } if (p_tests & 0x4) { + printf ("==> Test 0x4 try null stream"); hipStreamQuery(0/* try null stream*/); } + if (p_tests & 0x8) { + printf ("==> Test 0x8 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncWaitSameStream(); + } + HIPCHECK(hipDeviceSynchronize()); + } + passed(); } From 48fdf2fc936d57be366d29e7687e0359f4b180d7 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:19:37 -0500 Subject: [PATCH 069/171] Update streamEventTEst. - add checks for events across devices. - refactor test to make sure it runs long enough to sensitive sync techniques. - add tests for DeviceSync, streamWaitEvent. [ROCm/clr commit: 444e4a20ba757773cca1385646c4b4a4ab769234] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 321 ++++++++++++++---- 1 file changed, 250 insertions(+), 71 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 63c42da557..1d9ec45685 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -29,39 +29,47 @@ THE SOFTWARE. 
#include "test_common.h" #include #include -unsigned p_streams = 6; +unsigned p_streams = 8; unsigned p_db = 0; +unsigned p_count = 100; + template __global__ void -addOne( const T *A_d, +addCount( const T *A_d, T *C_d, - size_t NELEM) + size_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (size_t i=offset; i __global__ void -addOneReverse( const T *A_d, +addCountReverse( const T *A_d, T *C_d, - int64_t NELEM) + int64_t NELEM, + int count) { size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); size_t stride = hipBlockDim_x * hipGridDim_x ; - for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { - C_d[i] = A_d[i] + (T)1; - //C_d[i] = (T)1; - } + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } } @@ -70,41 +78,65 @@ addOneReverse( const T *A_d, template class Streamer { public: - Streamer(T *input, size_t numElements, bool reverse); + Streamer(int deviceId, T *input, size_t numElements, bool reverse); ~Streamer(); - void runAsyncAfter(Streamer *depStreamer); + void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); void queryUntilComplete(); - void syncAndCheck(int streamerNum, T initValue, T expectedOffset); + size_t check(int streamerNum, T initValue, T expectedOffset, bool expectPass=true); + void copyToHost(hipStream_t copyStream); hipEvent_t event() { return _event; }; + int deviceId() const { return _deviceId; }; + size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; private: + T *_C_h; + T *_preA_d; // if input is on another device, this is pointer to that memory. 
T *_A_d; T *_C_d; hipStream_t _stream; hipEvent_t _event; + int _deviceId; size_t _numElements; bool _reverse; + + size_t _mismatchCount; }; template -Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : + _preA_d(NULL), _A_d(A_d), + _deviceId(deviceId), _numElements(numElements), _reverse(reverse) { size_t sizeElements = numElements * sizeof(int); + HIPCHECK(hipSetDevice(_deviceId)); + + + hipPointerAttribute_t attr; + HIPCHECK(hipPointerGetAttributes(&attr, A_d)); + if (attr.device != deviceId) { + // source is on another device, we will need to copy later. + // So save original source pointer and allocate local space. + printf ("info: source for streamer on another device, will insert memcpy\n"); + _preA_d = A_d; + HIPCHECK(hipMalloc(&_A_d, sizeElements)); + HIPCHECK(hipMemset(_A_d, -3, sizeElements)); + } + HIPCHECK(hipMalloc(&_C_d, sizeElements)); HIPCHECK(hipHostMalloc(&_C_h, sizeElements)); @@ -113,12 +145,16 @@ Streamer::Streamer(T * A_d, size_t numElements, bool reverse) : HIPCHECK(hipStreamCreate(&_stream)); HIPCHECK(hipEventCreate(&_event)); + + + }; template -void Streamer::runAsyncAfter(Streamer *depStreamer) +void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { + HIPCHECK(hipSetDevice(_deviceId)); if (p_db) { printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); } @@ -127,36 +163,31 @@ void Streamer::runAsyncAfter(Streamer *depStreamer) HIPCHECK(hipStreamWaitEvent(_stream, depStreamer->event(), 0)); } - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } - HIPCHECK(hipEventRecord(_event, 
_stream)); -} - - -template -void Streamer::runAsyncWaitSameStream() -{ - printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); - unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { - hipLaunchKernelGGL(addOneReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); - } else { - hipLaunchKernelGGL(addOne, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements); + if (_preA_d) { + // _preA_d is on another device, so copy to local device so kernel can access it: + HIPCHECK(hipMemcpyAsync(_A_d, _preA_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); } - // Test case where hipStreamWaitEvent waits on same event we just placed into the queue. + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + if (_reverse) { + hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else { + hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } HIPCHECK(hipEventRecord(_event, _stream)); - HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); + + if (waitSameStream) { + HIPCHECK(hipStreamWaitEvent(_stream, _event, 0)); // this is essentially a no-op, but make sure it doesn't crash + } } + template void Streamer::queryUntilComplete() { + HIPCHECK(hipSetDevice(_deviceId)); int numQueries = 0; hipError_t e = hipSuccess; do { @@ -168,19 +199,48 @@ void Streamer::queryUntilComplete() }; +// If copyStream is !nullptr it is used for the copy. 
template -void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) +void Streamer::copyToHost(hipStream_t copyStream) { - HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, _stream)); - HIPCHECK(hipStreamSynchronize(_stream)); + if (p_db) { + printf ("db: copy back to host\n"); + } + HIPCHECK(hipSetDevice(_deviceId)); + HIPCHECK(hipMemcpyAsync(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost, copyStream ? copyStream : _stream)); + HIPCHECK(hipStreamSynchronize(copyStream ? copyStream:_stream)); +} + + +template +size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool expectPass) +{ T expected = initValue + expectedOffset; + if (p_db) { + printf ("db: check\n"); + } + _mismatchCount = 0; for (size_t i=0; i<_numElements; i++) { if (_C_h[i] != expected) { - failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + _mismatchCount++; + if (expectPass) { + fprintf(stderr, "for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + if (_mismatchCount > 10) { + failed("for streamer:%d _C_h[%zu] (%d) != expected(%d)\n", streamerNum, i, _C_h[i], expected); + } + } } } + + if (!expectPass && (_mismatchCount ==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + //failed("for streamer:%d we expected inavalid synchronization to lead to mismatch but none was detected. Increase --N to sensitize sync.\n", streamerNum); + + } + + return _mismatchCount; } @@ -189,6 +249,8 @@ void Streamer::syncAndCheck(int streamerNum, T initValue, T expectedOffset) //Parse arguments specific to this test. 
void parseMyArguments(int argc, char *argv[]) { + N = 64*1024*1024; + int more_argc = HipTest::parseStandardArguments(argc, argv, false); // parse args for this test: @@ -199,6 +261,14 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--count")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_count)) { + failed("Bad count argument"); + } + } else if (!strcmp(arg, "--db")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_db)) { + failed("Bad db argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -206,6 +276,91 @@ void parseMyArguments(int argc, char *argv[]) }; +typedef Streamer IntStreamer; + + + + +void runStreamerLoop(std::vector &streamers) +{ + for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); + } +} + + +void checkAll(int initValue, std::vector &streamers, std::vector &sideStreams, bool expectPass=true) +{ + size_t mismatchCount=0; + + // Copy in reverse order to catch anything not yet finished... + for (int i=streamers.size()-1; i>=0; i--) { + streamers[i]->copyToHost(sideStreams.empty() ? NULL : sideStreams[streamers[i]->deviceId()]); + } + + + // Check in forward order so we can find first mismatch: + for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + } + if (!expectPass && (mismatchCount==0)) { + // the test should run kernels long enough that if we don't correctly wait for them to finish then an error is reported. + failed("we expected inavalid synchronization to lead to mismatch but none was detected. 
Increase --count to sensitize sync.\n"); + } + +} + + + +#define RUN_SYNC_TEST(_enableBit, _streamers, _sync, _expectPass)\ + if (p_tests & (_enableBit)) {\ + printf ("==> Test %02x runAsyncAfter sync=%s\n", (_enableBit), #_sync);\ + runStreamerLoop(_streamers);\ + (_sync);\ + checkAll (initValue, _streamers, sideStreams, _expectPass);\ + } + + + + +//--- +// A family of sync functions which somehow wait for inflight activity to finish: + + +void sync_none(void) {}; + +void sync_allDevices(int numDevices) +{ + for (int d=0; d streamers) +{ + for (int i=0; iqueryUntilComplete(); + }; +} + + +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +{ + HIPCHECK(hipSetDevice(sideDeviceId)); + + // wait on the last event in the stream of chained streamers: + // This plants a marker which the subsquent copy for this device will wait on: + HIPCHECK(hipStreamWaitEvent(sideStream, lastEvent, 0)); + + if (waitHere) { + HIPCHECK(hipStreamSynchronize(sideStream)); + } +} + + //--- int main(int argc, char *argv[]) @@ -213,13 +368,17 @@ int main(int argc, char *argv[]) HipTest::parseStandardArguments(argc, argv, false); parseMyArguments(argc, argv); - typedef Streamer IntStreamer; + std::vector streamers; + std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); + printf("info: sizeof arrays = %zu elements (%6.3f MB)\n", numElements, sizeElements / 1024.0/1024.0); + printf("info: streams=%d count=%d\n", p_streams, p_count); + assert (sizeElements <= std::numeric_limits::max()); @@ -234,45 +393,65 @@ int main(int argc, char *argv[]) HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + numDevices =2; // TODO - remove me. 
- for (int i=0; iC_d() : initArray_d, numElements, i&1 /*reverse?*/); - streamers.push_back(s); - } - - if (p_tests & 0x1) { - printf ("==> Test 0x1 runAsyncAfter\n"); + for (int d=0; drunAsyncAfter(i ? streamers[i-1] : NULL); - } - HIPCHECK(hipDeviceSynchronize()); - - for (int i=0; isyncAndCheck(i+1, initValue, i+1); + IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, i&1 /*reverse?*/); + streamers.push_back(s); + if (d==0) { + streamersDev0.push_back(s); + } } } - if (p_tests & 0x2) { - printf ("==> Test 0x2 queryUntilComplete\n"); - for (int i=0; irunAsyncAfter(i ? streamers[i-1] : NULL); - streamers[i]->queryUntilComplete(); - } - HIPCHECK(hipDeviceSynchronize()); + // A sideband stream channel that is independent from above. + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // asynchronous wrt the other streams. + std::vector sideStreams; + for (int d=0; d Test 0x4 try null stream"); + + // Tests on first GPU: + RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); + RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + + if (numDevices > 1) { + // Sync on second device for activity running on device 0: + RUN_SYNC_TEST(0x10, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 1, sideStreams[1], true), true); + } + + + // Tests on all GPUs: + // RUN_SYNC_TEST(0x100, streamers, sync_streamWaitEvent(streamers.back()->event(), 0, sideStreams[0], false), true); + + + + + if (p_tests & 0x1000) { + printf ("==> Test 0x1000 try null stream\n"); hipStreamQuery(0/* try null stream*/); } - if (p_tests & 0x8) { - printf ("==> Test 0x8 runAsyncWaitSameStream\n"); - for (int i=0; irunAsyncWaitSameStream(); + + // 
Insert small wrinkle here, insert a wait on event just recorded, all in the same stream. + if (p_tests & 0x2000) { + printf ("==> Test 0x2000 runAsyncWaitSameStream\n"); + for (int i=0; irunAsyncAfter(i ? streamersDev0[i-1] : NULL, true/*waitSameStream*/); } - HIPCHECK(hipDeviceSynchronize()); + + sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false); + checkAll (initValue, streamersDev0, sideStreams); } From 4afda1720d44015e428cf25c7bcd3d66cb032bd5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:20:56 -0500 Subject: [PATCH 070/171] Refactor hipHostRegister test. Run all tests in one command. Run 128 offsets. [ROCm/clr commit: e417eb5d35ed64917de9edfcde02fdfd6e24ae41] --- .../hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp index 8cf0979261..3376ee04f1 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostRegister.cpp @@ -19,9 +19,7 @@ THE SOFTWARE. 
/* HIT_START * BUILD: %t %s ../../test_common.cpp - * RUN: %t --tests 0x1 - * RUN: %t --tests 0x2 - * RUN: %t --tests 0x4 + * RUN: %t * HIT_END */ @@ -131,7 +129,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipMalloc(&Bd, size)); // TODO - set to 128 -#define OFFSETS_TO_TRY 1 +#define OFFSETS_TO_TRY 128 assert (N>OFFSETS_TO_TRY); if (p_tests & 0x2) { From fa1ac559cbd85c30d93bb81b3263a95e9d55c94d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 5 May 2017 17:28:11 -0500 Subject: [PATCH 071/171] Fix some typos, add additional guidance for -BSymbolic [ROCm/clr commit: 9a026b62a8a4019d8125696081307cb5f6f4ecde] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 2 ++ projects/clr/hipamd/src/hip_memory.cpp | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index abb31d80e8..91b2a5a019 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -45,6 +45,8 @@ To correct, add the following flag to hcc or hipcc: $ hipcc -Wl,-Bsymbolic ... ``` +Ensure there is no space in the "Wl,-Bsymbolic" option. + ### What is the current limitation of HIP Generic Grid Launch method? 1. __global__ functions cannot be marked as static or put in an unnamed namespace i.e. 
they cannot be given internal linkage (this would clash with __attribute__((weak))); diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index c4bc7db096..1ba698f461 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -104,8 +104,8 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig auto device = ctx->getWriteableDevice(); ptr = hc::am_alloc(sizeBytes, device->_acc, amFlags); - tprintf(DB_MEM, " alloc %s ptr:%p size:%zu on dev:%d\n", - msg, ptr, sizeBytes, device->_deviceId); + tprintf(DB_MEM, " alloc %s ptr:%p-%p size:%zu on dev:%d\n", + msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { int r = sharePtr(ptr, ctx, hipFlags); From b5f0a9471d80a51bfa1f081bff46c68ae0f5f9ad Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 9 May 2017 10:14:16 -0500 Subject: [PATCH 072/171] added guard against hip_runtime.h so that non-hcc compilers can use it Change-Id: I3d68deda9ce8a5956e21e15a69e549d6c21e3e39 [ROCm/clr commit: 14930dc59452e4e34152213c5c8f3b57a227e086] --- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index 06ce65bc9a..4d8876d8f4 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -41,6 +41,8 @@ THE SOFTWARE. #include #endif//__cplusplus +#if __HCC__ + // Define NVCC_COMPAT for CUDA compatibility #define NVCC_COMPAT #define CUDA_SUCCESS hipSuccess @@ -481,6 +483,6 @@ do {\ */ - +#endif #endif//HIP_HCC_DETAIL_RUNTIME_H From 501d0b3161b257c635d96af706cd472908b6532f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 10 May 2017 13:23:49 -0500 Subject: [PATCH 073/171] Fix hipStreamWaitEvent for single GPU. 
[ROCm/clr commit: ae9fdf9bc10835aeeed662ae74da6f39548e926d] --- .../hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 1d9ec45685..adf0d4af0c 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -395,7 +395,7 @@ int main(int argc, char *argv[]) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - numDevices =2; // TODO - remove me. + numDevices = min(2, numDevices); // multi-GPU to 2 device. for (int d=0; d Date: Wed, 10 May 2017 17:32:25 -0500 Subject: [PATCH 074/171] hipHostMalloc allocation are mapped to all devices by default. Support hipHostMallocPortable flag. Default flags are hipHostMallocPortable | hipHostMallocMapped. Also: -refactor tests to move addCount and addCountReverse into HipTest namespace. -test multi-GPU host memory. [ROCm/clr commit: ff9bed653584a34cf5fafa3545a1ad1f22c74ca2] --- projects/clr/hipamd/src/hip_hcc.cpp | 9 ++ projects/clr/hipamd/src/hip_hcc_internal.h | 1 + projects/clr/hipamd/src/hip_memory.cpp | 58 ++++---- .../clr/hipamd/tests/src/hipPointerAttrib.cpp | 2 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 129 +++++++++++++----- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 40 +----- projects/clr/hipamd/tests/src/test_common.h | 38 ++++++ 7 files changed, 181 insertions(+), 96 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 71d947488d..81a2079b5b 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -118,6 +118,7 @@ bool g_visible_device = false; unsigned g_deviceCnt; std::vector g_hip_visible_devices; hsa_agent_t g_cpu_agent; +hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. 
unsigned g_numLogicalThreads; std::atomic g_lastShortTid(1); @@ -1389,6 +1390,14 @@ void ihipInit() g_deviceCnt++; } } + + g_allAgents = static_cast (malloc((g_deviceCnt+1) * sizeof(hsa_agent_t))); + g_allAgents[0] = g_cpu_agent; + for (int i=0; i_hsaAgent; + } + + g_numLogicalThreads = std::thread::hardware_concurrency(); // If HIP_VISIBLE_DEVICES is not set, make sure all devices are initialized diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 9c17c6e98c..132f099ce8 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -826,6 +826,7 @@ private: // Critical data, protected with locked access: extern std::once_flag hip_initialized; extern unsigned g_deviceCnt; extern hsa_agent_t g_cpu_agent ; // the CPU agent. +extern hsa_agent_t *g_allAgents; // CPU agents + all the visible GPU agents. //================================================================================================= // Extern functions: diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 1ba698f461..c4f0f64e50 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -59,31 +59,40 @@ hipError_t memcpyAsync (void* dst, const void* src, size_t sizeBytes, hipMemcpyK } // return 0 on success or -1 on error: -int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) +int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) { int ret = 0; auto device = ctx->getWriteableDevice(); hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); - int peerCnt=0; - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - peerCnt = crit->peerCnt(); - tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); - if (peerCnt > 1) { - //printf ("peer self access\n"); + if (shareWithAll) { + hsa_status_t s = 
hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } + } else { + int peerCnt=0; + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + peerCnt = crit->peerCnt(); + tprintf(DB_MEM, " allow access to %d other peer(s)\n", peerCnt-1); + if (peerCnt > 1) { - // TODOD - remove me: - for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { - tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); - }; + //printf ("peer self access\n"); - hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); - if (s != HSA_STATUS_SUCCESS) { - ret = -1; + // TODOD - remove me: + for (auto iter = crit->_peers.begin(); iter!=crit->_peers.end(); iter++) { + tprintf (DB_MEM, " allow access to peer: %s%s\n", (*iter)->toString().c_str(), (iter == crit->_peers.begin()) ? " (self)":""); + }; + + hsa_status_t s = hsa_amd_agents_allow_access(crit->peerCnt(), crit->peerAgents(), NULL, ptr); + if (s != HSA_STATUS_SUCCESS) { + ret = -1; + } } } } @@ -96,7 +105,7 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, unsigned hipFlags) // Allocate a new pointer with am_alloc and share with all valid peers. 
// Returns null-ptr if a memory error occurs (either allocation or sharing) -void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsigned amFlags, unsigned hipFlags) +void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool shareWithAll, unsigned amFlags, unsigned hipFlags) { void *ptr = nullptr; @@ -108,7 +117,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, unsig msg, ptr, static_cast(ptr)+sizeBytes, sizeBytes, device->_deviceId); if (ptr != nullptr) { - int r = sharePtr(ptr, ctx, hipFlags); + int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); if (r != 0) { ptr = nullptr; } @@ -220,7 +229,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } else { auto device = ctx->getWriteableDevice(); - *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, 0/*amFlags*/, 0/*hipFlags*/); + *ptr = hip_internal::allocAndSharePtr("device_mem", sizeBytes, ctx, false/*shareWithAll*/, 0/*amFlags*/, 0/*hipFlags*/); if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; @@ -253,7 +262,8 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } else { unsigned trueFlags = flags; if (flags == hipHostMallocDefault) { - trueFlags = hipHostMallocMapped | hipHostMallocWriteCombined; + // HCC/ROCM provide a modern system with unified memory and should set both of these flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; } const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; @@ -265,8 +275,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) auto device = ctx->getWriteableDevice(); unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? 
"finegrained_host":"pinned_host", - sizeBytes, ctx, amFlags, flags); + sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); + if(sizeBytes && (*ptr == NULL)){ hip_status = hipErrorMemoryAllocation; } @@ -314,7 +326,7 @@ hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height auto device = ctx->getWriteableDevice(); const unsigned am_flags = 0; - *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_pitch", sizeBytes, ctx, false/*shareWithAll*/, am_flags, 0); if (sizeBytes && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; @@ -373,7 +385,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc, hip_status = hipErrorUnknown; break; } - *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, am_flags, 0); + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false/*shareWithAll*/, am_flags, 0); if (size && (*ptr == NULL)) { hip_status = hipErrorMemoryAllocation; } diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index fb7832d9a6..7a2ab64bea 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -99,7 +99,7 @@ inline int zrand(int max) //================================================================================================= -// Functins to run tests +// Functions to run tests //================================================================================================= //-- //Run through a couple simple cases to test lookups and host pointer arithmetic: diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 0a256d6362..1ee5cbc9bb 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ 
b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -25,45 +25,106 @@ THE SOFTWARE. #include"test_common.h" -#define SIZE 1024*1024*256 +#define NUM_ELEMENTS 1024*1024*64 +#define SIZE NUM_ELEMENTS*sizeof(int) -int main(){ - float *Ad, *B, *Bd, *Bm, *C, *Cd, *ptr_0; - B = (float*)malloc(SIZE); - hipMalloc((void**)&Ad, SIZE); - hipHostMalloc((void**)&B, SIZE); - hipHostMalloc((void**)&Bd, SIZE, hipHostMallocDefault); - hipHostMalloc((void**)&Bm, SIZE, hipHostMallocMapped); - hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped); - - hipHostGetDevicePointer((void**)&Cd, C, 0/*flags*/); - - HIPCHECK_API(hipMalloc((void**)&ptr_0,0), hipSuccess); - - HIPCHECK_API(hipFree(Ad) , hipSuccess); - HIPCHECK_API(hipHostFree(Ad) , hipErrorInvalidValue); - - HIPCHECK_API(hipFree(B) , hipErrorInvalidDevicePointer); // try to hipFree on malloced memory - HIPCHECK_API(hipFree(Bd) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(Bm) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipFree(ptr_0) , hipSuccess); - HIPCHECK_API(hipHostFree(Bd) , hipSuccess); - HIPCHECK_API(hipHostFree(Bm) , hipSuccess); - - HIPCHECK_API(hipFree(C) , hipErrorInvalidDevicePointer); - HIPCHECK_API(hipHostFree(C) , hipSuccess); +int p_count = 4; - HIPCHECK_API(hipFree(NULL) , hipSuccess); - HIPCHECK_API(hipHostFree(NULL) , hipSuccess); +void multiGpuHostAlloc(int allocDevice) +{ + + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + assert(numDevices > 1); + + printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); + + + HIPCHECK(hipSetDevice(allocDevice)); + + int *Ah, *Ch; + hipHostMalloc((void**)&Ah, SIZE); + hipHostMalloc((void**)&Ch, SIZE); + + const int init = -1; + for (size_t i=0; i 1); + + multiGpuHostAlloc(0); + multiGpuHostAlloc(1); } passed(); diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 
adf0d4af0c..80ff7ad98d 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -35,42 +35,6 @@ unsigned p_count = 100; -template -__global__ void -addCount( const T *A_d, - T *C_d, - size_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i -__global__ void -addCountReverse( const T *A_d, - T *C_d, - int64_t NELEM, - int count) -{ - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; - - // Deliberately do this in an inefficient way to increase kernel runtime - for (int i=0; i=0; i-=stride) { - C_d[i] = A_d[i] + (T)count; - } - } -} //------ @@ -171,9 +135,9 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); if (_reverse) { - hipLaunchKernelGGL(addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } else { - hipLaunchKernelGGL(addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); } HIPCHECK(hipEventRecord(_event, _stream)); diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 1250de4801..633ee6f825 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -146,6 +146,44 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +addCount( const T *A_d, + T 
*C_d, + size_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i +__global__ void +addCountReverse( const T *A_d, + T *C_d, + int64_t NELEM, + int count) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + // Deliberately do this in an inefficient way to increase kernel runtime + for (int i=0; i=0; i-=stride) { + C_d[i] = A_d[i] + (T)count; + } + } +} + + template void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) From 42292df1a38e86f046e15c75fd7b798e2782e55c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 11 May 2017 21:50:36 +0300 Subject: [PATCH 075/171] [HIPIFY] Fix string routines. Some Clang tooling functions return std::string, some return StringRef. Assigning of returning std::string to StringRef variables leads to garbage in it. DEBUG build is always affected. 
[ROCm/clr commit: aa52b94be1759ab5afa07f68a518bfee1ba5c85e] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 51 +++++++++---------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 1c6406b4ab..47434babac 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2690,7 +2690,7 @@ private: bool cudaCall(const MatchFinder::MatchResult &Result) { if (const CallExpr *call = Result.Nodes.getNodeAs("cudaCall")) { const FunctionDecl *funcDcl = call->getDirectCallee(); - StringRef name = funcDcl->getDeclName().getAsString(); + std::string name = funcDcl->getDeclName().getAsString(); SourceManager *SM = Result.SourceManager; SourceLocation sl = call->getLocStart(); const auto found = N.cuda2hipRename.find(name); @@ -2714,16 +2714,16 @@ private: } } if (bReplace) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); Replacement Rep(*SM, sl, length, repName); FullSourceLoc fullSL(sl, *SM); insertReplacement(Rep, fullSL); } } else { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [function call]."; + std::string msg = "the following reference is not handled: '" + name + "' [function call]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2838,7 +2838,7 @@ private: bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) { if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs("cudaEnumConstantRef")) { - StringRef name = enumConstantRef->getDecl()->getNameAsString(); + StringRef name = enumConstantRef->getDecl()->getName(); SourceLocation sl = enumConstantRef->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); @@ -2861,10 +2861,10 @@ private: bool 
cudaEnumDecl(const MatchFinder::MatchResult &Result) { if (const VarDecl *enumDecl = Result.Nodes.getNodeAs("cudaEnumDecl")) { - StringRef name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); + std::string name = enumDecl->getType()->getAsTagDecl()->getNameAsString(); QualType QT = enumDecl->getType().getUnqualifiedType(); - StringRef name_unqualified = QT.getAsString(); - if ((name_unqualified.find(' ') == StringRef::npos && name.find(' ') == StringRef::npos) || name.empty()) { + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { name = name_unqualified; } // Workaround for enum VarDecl as param decl, declared with enum type specifier @@ -2882,7 +2882,7 @@ private: //------------------------------------------------- const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2890,7 +2890,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant decl]."; + std::string msg = "the following reference is not handled: '" + name + "' [enum constant decl]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2905,12 +2905,12 @@ private: QT = QT.getTypePtr()->getAsArrayTypeUnsafe()->getElementType(); } QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); SourceLocation sl = typedefVar->getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = 
found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2918,7 +2918,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var]."; printHipifyMessage(*SM, sl, msg); } return true; @@ -2935,10 +2935,10 @@ private: SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); QualType QT = t->getPointeeType(); QT = QT.getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2947,7 +2947,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [typedef var ptr]."; + std::string msg = "the following reference is not handled: '" + name + "' [typedef var ptr]."; printHipifyMessage(*SM, sl, msg); } } @@ -2961,13 +2961,13 @@ private: QualType QT = structVar->getType(); // ToDo: find case-studies with types other than Struct. 
if (QT->isStructureType()) { - StringRef name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); + std::string name = QT.getTypePtr()->getAsStructureType()->getDecl()->getNameAsString(); TypeLoc TL = structVar->getTypeSourceInfo()->getTypeLoc(); SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -2976,7 +2976,7 @@ private: } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [struct var]."; + std::string msg = "the following reference is not handled: '" + name + "' [struct var]."; printHipifyMessage(*SM, sl, msg); } } @@ -3049,7 +3049,7 @@ private: // Example: extern __shared__ uint sRadix1[]; if (sharedVar->hasExternalFormalLinkage()) { QualType QT = sharedVar->getType(); - StringRef typeName; + std::string typeName; if (QT->isIncompleteArrayType()) { const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); @@ -3071,9 +3071,8 @@ private: SourceLocation slEnd = sharedVar->getLocEnd(); SourceManager *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; - SmallString<128> tmpData; - StringRef varName = sharedVar->getNameAsString(); - StringRef repName = Twine("HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")").toStringRef(tmpData); + std::string varName = sharedVar->getNameAsString(); + std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")"; Replacement Rep(*SM, slStart, repLength, repName); FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); @@ -3089,7 +3088,7 @@ private: bool cudaParamDecl(const MatchFinder::MatchResult &Result) { 
if (const ParmVarDecl *paramDecl = Result.Nodes.getNodeAs("cudaParamDecl")) { QualType QT = paramDecl->getOriginalType().getUnqualifiedType(); - StringRef name = QT.getAsString(); + std::string name = QT.getAsString(); const Type *t = QT.getTypePtr(); if (t->isStructureOrClassType()) { name = t->getAsCXXRecordDecl()->getName(); @@ -3099,7 +3098,7 @@ private: SourceManager *SM = Result.SourceManager; const auto found = N.cuda2hipRename.find(name); if (found != N.cuda2hipRename.end()) { - updateCounters(found->second, name.str()); + updateCounters(found->second, name); if (!found->second.unsupported) { StringRef repName = found->second.hipName; Replacement Rep(*SM, sl, name.size(), repName); @@ -3107,7 +3106,7 @@ private: insertReplacement(Rep, fullSL); } } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [param decl]."; + std::string msg = "the following reference is not handled: '" + name + "' [param decl]."; printHipifyMessage(*SM, sl, msg); } return true; From 9b222c0ae44fa7e45868c285bb0d55638274f475 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 15:57:32 -0500 Subject: [PATCH 076/171] Add hipEventDisableSystemRelease flag. [ROCm/clr commit: 2c2625cb9e9714a8af64d5d46e9fb8bae96da97d] --- .../hipamd/include/hip/hcc_detail/hip_runtime_api.h | 1 + .../hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ projects/clr/hipamd/src/hip_hcc.cpp | 10 +++++++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 9cfd21c1d2..175fd64d29 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -106,6 +106,7 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. 
#define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. +#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. //! Flags that can be used with hipHostMalloc diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index aad3ffcc44..e9f926b336 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,6 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess +#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ + #define hipHostMallocDefault cudaHostAllocDefault #define hipHostMallocPortable cudaHostAllocPortable diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 81a2079b5b..e936cf3af3 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -56,6 +56,9 @@ THE SOFTWARE. 
#define USE_ROCR_1_4 1 #endif +// needs HCC change for hc::no_scope +#define USE_NO_SCOPE 0 + //================================================================================================= //Global variables: //================================================================================================= @@ -364,8 +367,13 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); this->ensureHaveQueue(crit); +#if USE_NO_SCOPE + printf ("create_marker, flags = %x\n", event->_flags); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); +#else event->_marker = crit->_av.create_marker(); -} +#endif +}; //============================================================================= From 0d3c99eb6ebd98a558d610726c89d6458dda96c3 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 16:05:28 -0500 Subject: [PATCH 077/171] Remove old USE_ switches no longer needed. [ROCm/clr commit: c7c62dd022a06713198cb8c791680501d4b693ea] --- projects/clr/hipamd/src/hip_hcc.cpp | 32 ----------------------------- 1 file changed, 32 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index e936cf3af3..12f1792c33 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -48,14 +48,6 @@ THE SOFTWARE. #include "env.h" -#ifndef USE_COPY_EXT_V2 -#define USE_COPY_EXT_V2 1 -#endif - -#ifndef USE_ROCR_1_4 -#define USE_ROCR_1_4 1 -#endif - // needs HCC change for hc::no_scope #define USE_NO_SCOPE 0 @@ -107,10 +99,6 @@ int HCC_OPT_FLUSH = 0; -#define HIP_USE_PRODUCT_NAME 1 -//#define DISABLE_COPY_EXT 1 - - std::once_flag hip_initialized; // Array of pointers to devices. 
@@ -857,11 +845,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) // Get Max Threads Per Multiprocessor uint32_t max_waves_per_cu; -#if USE_ROCR_1_4 err = hsa_agent_get_info(_hsaAgent,(hsa_agent_info_t) HSA_AMD_AGENT_INFO_MAX_WAVES_PER_CU, &max_waves_per_cu); -#else - max_waves_per_cu = 10; -#endif DeviceErrorCheck(err); prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu; @@ -1919,11 +1903,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? ©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } @@ -2031,18 +2011,10 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { -#if USE_COPY_EXT_V2 crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc, forceUnpinnedCopy); -#else - crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } else { -#if USE_COPY_EXT_V2 crit->_av.copy_async_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, ©Device->getDevice()->_acc); -#else - crit->_av.copy_async(src, dst, sizeBytes); -#endif } } catch (Kalmar::runtime_exception) { throw ihipException(hipErrorRuntimeOther); @@ -2075,11 +2047,7 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes this->ensureHaveQueue(crit); -#if USE_COPY_EXT_V2 crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? 
©Device->getDevice()->_acc : nullptr, forceUnpinnedCopy); -#else - crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, forceUnpinnedCopy); -#endif } } } From feca2ed0dca2dd77e0d8cf04c76f3d5561964c7f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 12 May 2017 21:43:34 -0500 Subject: [PATCH 078/171] added gfx900 to hipDeviceProp_t Change-Id: I49e7a32f218926fd55f1c94c5dc2366d6c8ac4ca [ROCm/clr commit: a6dc00f1675ad79ef5a27fe0b4965d10bdb7a1b8] --- projects/clr/hipamd/src/hip_hcc.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 12f1792c33..a655e35aa1 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -772,6 +772,9 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) if(strcmp(archName,"gfx803")==0){ prop->gcnArch = 803; } + if(strcmp(archName,"gfx900")==0){ + prop->gcnArch = 900; + } DeviceErrorCheck(err); From 37d5a094930bb3f67d80efa0225ddca5129bf43a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 15 May 2017 15:35:52 +0300 Subject: [PATCH 079/171] [HIPIFY] CUDA Driver API: Primary Context Management support. 
[ROCm/clr commit: 181d3e2bae0750b870584fa14a192c98d41ed514] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 47434babac..72cbcaf52a 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -858,6 +858,13 @@ struct cuda2hipMap { cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + // Primary Context Management + cuda2hipRename["cuDevicePrimaryCtxGetState"] = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRelease"] = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxReset"] = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxSetFlags"] = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER}; + // Device cuda2hipRename["cuDeviceGet"] = {"hipGetDevice", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetName"] = {"hipDeviceGetName", CONV_DEV, API_DRIVER}; From 293524f1dcf973170e3d72bedb7cd2a8f0bd0e5b Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 16 May 2017 07:15:13 +0530 Subject: [PATCH 080/171] Added hipMallocPitch on HIP/NVCC path Change-Id: Ie3ba7d3f95acac23805efa919531043b350a3f21 [ROCm/clr commit: d22b731f95b4fc1e2b959bee6dbc2fb530c0991a] --- projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index e9f926b336..69a9b46570 100644 --- 
a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -204,6 +204,10 @@ inline static hipError_t hipMalloc(void** ptr, size_t size) { return hipCUDAErrorTohipError(cudaMalloc(ptr, size)); } +inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) { + return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height)); +} + inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); } From b2fa897dedb78cb4cb95a27fd332f1463023e345 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 16 May 2017 18:21:25 +0300 Subject: [PATCH 081/171] [HIPIFY] cudaMallocPitch -> hipMallocPitch [ROCm/clr commit: 221faeb916a481350be4284eec60e0d168294912] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 72cbcaf52a..0c6b0f1efc 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -1088,7 +1088,7 @@ struct cuda2hipMap { cuda2hipRename["cudaMalloc3DArray"] = {"hipMalloc3DArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocManaged"] = {"hipMallocManaged", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMallocMipmappedArray"] = {"hipMallocMipmappedArray", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMallocPitch"] = {"hipMallocPitch", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFree"] = {"hipFree", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaFreeHost"] = {"hipHostFree", CONV_MEM, API_RUNTIME}; From 9a097161acda6639616f01abdc064e939cf4725f Mon Sep 17 00:00:00 2001 From: emankov Date: Tue, 16 May 2017 19:52:39 +0300 Subject: [PATCH 082/171] 
[HIPIFY] *.inl extension support for batch processing [ROCm/clr commit: f6a0cb3afca26ff4850bae6e5928389f29326423] --- projects/clr/hipamd/bin/findcode.sh | 2 +- projects/clr/hipamd/bin/hipexamine.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/bin/findcode.sh b/projects/clr/hipamd/bin/findcode.sh index a2334b3e2d..d092d6bf8d 100755 --- a/projects/clr/hipamd/bin/findcode.sh +++ b/projects/clr/hipamd/bin/findcode.sh @@ -2,4 +2,4 @@ SEARCH_DIRS=$@ -find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' +find $SEARCH_DIRS -name '*.cpp' -o -name '*.h' -o -name '*.cu' -o -name '*.cuh' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl' diff --git a/projects/clr/hipamd/bin/hipexamine.sh b/projects/clr/hipamd/bin/hipexamine.sh index 2a6fab7110..79e1b469f9 100755 --- a/projects/clr/hipamd/bin/hipexamine.sh +++ b/projects/clr/hipamd/bin/hipexamine.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine2.sh DIRNAME [hipify options] [--] [clang options] +#usage : hipexamine.sh DIRNAME [hipify options] [--] [clang options] # Generate CUDA->HIP conversion statistics for all the code files in the specified directory. From ee05975efad248e686814b71adf53a4617affedc Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 27 Jan 2017 11:21:08 -0600 Subject: [PATCH 083/171] Add HIP_TRACE_API=4. Only display memory allocation/free apis. 
[ROCm/clr commit: 7e7ba5027fcdfdeabaa716f0c2f6894387b79066] --- projects/clr/hipamd/README.md | 1 + .../clr/hipamd/docs/markdown/hip_profiling.md | 5 ++ projects/clr/hipamd/src/hip_hcc.cpp | 2 +- projects/clr/hipamd/src/hip_hcc_internal.h | 16 ++++--- projects/clr/hipamd/src/hip_memory.cpp | 46 +++++++++---------- 5 files changed, 39 insertions(+), 31 deletions(-) diff --git a/projects/clr/hipamd/README.md b/projects/clr/hipamd/README.md index f61c3b106a..d54032c6df 100644 --- a/projects/clr/hipamd/README.md +++ b/projects/clr/hipamd/README.md @@ -32,6 +32,7 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) +- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index 6e5cde700d..b5c4672464 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -267,6 +267,11 @@ info: check result PASSED! ``` +HIP_TRACE_API supports multiple levels of debug information: + - 0x1 = print all HIP APIs + - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets. Includes hipLaunchKernel, hipMemcpy*, hipMemset*. + - 0x4 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + #### Color Note this trace mode uses colors. "less -r" can handle raw control characters and will display the debug output in proper colors. 
diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index a655e35aa1..07604fe85d 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -1435,7 +1435,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) void ihipPrintKernelLaunch(const char *kernelName, const grid_launch_parm *lp, const hipStream_t stream) { - if ((HIP_TRACE_API & (1< dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); @@ -826,7 +826,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, spitch, width, height, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, spitch, width, height, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -879,7 +879,7 @@ hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset, con hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset, size_t hOffset, const void* src, size_t count, hipMemcpyKind kind) { - HIP_INIT_CMD_API(dst, wOffset, hOffset, src, count, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, wOffset, hOffset, src, count, kind); hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); @@ -938,7 +938,7 @@ ihipMemsetKernel(hipStream_t stream, // TODO-sync: function is async unless target is pinned host memory - then these are fully sync. 
hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t stream ) { - HIP_INIT_CMD_API(dst, value, sizeBytes, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes, stream); hipError_t e = hipSuccess; @@ -988,7 +988,7 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; @@ -1148,7 +1148,7 @@ hipError_t hipMemPtrGetInfo(void *ptr, size_t *size) hipError_t hipFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); hipError_t hipStatus = hipErrorInvalidDevicePointer; @@ -1176,7 +1176,7 @@ hipError_t hipFree(void* ptr) hipError_t hipHostFree(void* ptr) { - HIP_INIT_API(ptr); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr); // Synchronize to ensure all work has finished. ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits for all activity to finish. 
@@ -1210,7 +1210,7 @@ hipError_t hipFreeHost(void* ptr) hipError_t hipFreeArray(hipArray* array) { - HIP_INIT_API(array); + HIP_INIT_SPECIAL_API((TRACE_MEM), array); hipError_t hipStatus = hipErrorInvalidDevicePointer; From a8d917c09211afdffaf00dec5e35c8b096c2dbdc Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 14 Feb 2017 21:50:16 -0600 Subject: [PATCH 084/171] split debugging into separate .md file [ROCm/clr commit: 61c863311335a1557e862696b48596afc8c9ab46] --- projects/clr/hipamd/README.md | 3 +- .../src/runtimeApi/stream/hipNullStream.cpp | 200 ++++++++++++++++++ 2 files changed, 202 insertions(+), 1 deletion(-) create mode 100644 projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp diff --git a/projects/clr/hipamd/README.md b/projects/clr/hipamd/README.md index d54032c6df..d04d63714f 100644 --- a/projects/clr/hipamd/README.md +++ b/projects/clr/hipamd/README.md @@ -32,7 +32,8 @@ HIP releases are typically of two types. The tag naming convention is different - [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) - [HIP Porting Guide](docs/markdown/hip_porting_guide.md) - [HIP Porting Driver Guide](docs/markdown/hip_porting_driver_api.md) -- [HIP Profiling and Debugging](docs/markdown/hip_profiling.md) +- [HIP Profiling ](docs/markdown/hip_profiling.md) +- [HIP Debugging](docs/markdown/hip_debugging.md) - [HIP Terminology](docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenL) - [hipify-clang](hipify-clang/README.md) - [Developer/CONTRIBUTING Info](CONTRIBUTING.md) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp new file mode 100644 index 0000000000..f8d201cb51 --- /dev/null +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -0,0 +1,200 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "hip/hip_runtime.h" +#include "test_common.h" +#include +unsigned p_streams = 6; +int p_repeat = 10; + + +template +__global__ void +vectorADDRepeat(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM, + int repeat) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int j=1; j<=repeat;j++) { + for (size_t i=offset; i +class Streamer { +public: + Streamer(size_t numElements, bool useNullStream=false); + ~Streamer(); + void enqueAsync(); + void queryUntilComplete(); + + +public: + T *_A_h; + T *_B_h; + T *_C_h; + + T *_A_d; + T *_B_d; + T *_C_d; + + hipStream_t _stream; + hipEvent_t _event; + + size_t _numElements; +}; + +template +Streamer::Streamer(size_t numElements, bool useNullStream) : + _numElements(numElements) +{ + HipTest::initArrays (&_A_d, &_B_d, &_C_d, &_A_h, &_B_h, &_C_h, numElements, true); + + if (useNullStream) { + _stream = 0x0; + } else { + HIPCHECK(hipStreamCreate(&_stream)); + } + HIPCHECK(hipEventCreate(&_event)); +}; + +template +void Streamer::enqueAsync() +{ + printf ("testing: %s numElements=%zu size=%6.2fMB\n", __func__, _numElements, _numElements * sizeof(T) / 1024.0/1024.0); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _B_d, _C_d, _numElements, p_repeat); + +} + +template +void Streamer::queryUntilComplete() +{ + int numQueries = 0; + hipError_t e = hipSuccess; + do { + numQueries++; + e = hipStreamQuery(_stream); + } while (e != hipSuccess) ; + + printf ("completed after %d queries\n", numQueries); +}; + + + +//--- +//Parse arguments specific to this test. 
+void parseMyArguments(int argc, char *argv[]) +{ + int more_argc = HipTest::parseStandardArguments(argc, argv, false); + + // parse args for this test: + for (int i = 1; i < more_argc; i++) { + const char *arg = argv[i]; + + if (!strcmp(arg, "--streams")) { + if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { + failed("Bad streams argument"); + } + } else { + failed("Bad argument '%s'", arg); + } + }; +}; + + + + + +//--- +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, false); + parseMyArguments(argc, argv); + + typedef Streamer FloatStreamer; + + std::vector streamers; + + size_t numElements = N; + + float *expected_H = (float*)malloc(numElements*sizeof(float)); + + + auto nullStreamer = new FloatStreamer(numElements, true); + for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + } + + + for (int i=0; i Test 0x1 runAsnc\n"); + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[p_streams - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete. + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); + HIPCHECK(hipStreamSynchronize(0)); + + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + } + + + if (p_tests & 0x2) { + printf ("==> Test 0x2 runAsnc-odd-only\n"); + for (int i=0; ienqueAsync(); + } + } + } + + + passed(); +} From b8b6cfe02ec3d4911c5ea8713f6ecbb1470a811e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:01:35 -0500 Subject: [PATCH 085/171] Doc update - split hip_debugging.md into separate file. 
[ROCm/clr commit: a5a12942b2298b7ecc6c9af76c2fc1e1908cb10f] --- .../clr/hipamd/docs/markdown/hip_debugging.md | 168 ++++++++++++++++++ projects/clr/hipamd/docs/markdown/hip_faq.md | 4 +- .../clr/hipamd/docs/markdown/hip_profiling.md | 160 +---------------- 3 files changed, 171 insertions(+), 161 deletions(-) create mode 100644 projects/clr/hipamd/docs/markdown/hip_debugging.md diff --git a/projects/clr/hipamd/docs/markdown/hip_debugging.md b/projects/clr/hipamd/docs/markdown/hip_debugging.md new file mode 100644 index 0000000000..e7e058d17a --- /dev/null +++ b/projects/clr/hipamd/docs/markdown/hip_debugging.md @@ -0,0 +1,168 @@ +Table of Contents +================= + + * [Profiling HIP Code](#profiling-hip-code" aria-hidden="true">" in the printed output. +This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. +The gdb syntax also supports using the variable name (in this case 'dst'): +``` +(gdb) p dst +$33 = (void *) 0x5ec7e9000 +(gdb) call hc::am_memtracker_print(dst) +TargetAddress:0x5ec7e9000 + 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) +... +-->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) + +``` + +To debug an explicit address, cast the address to (void*) : +``` +(gdb) call hc::am_memtracker_print((void*)0x508c7f000) +``` +- Debugging GPUVM fault. +For example: +``` +Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. + +Program received signal SIGABRT, Aborted. 
+[Switching to Thread 0x7fffdffb5700 (LWP 14893)] +0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. +(gdb) bt +#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 +#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 +#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#4 0x00007ffff6f78107 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 +#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 +(gdb) info threads + Id Target Id Frame + 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 + 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 +* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 + 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) thread 1 +[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +(gdb) bt +#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#1 0x00007ffff6f929ba in ?? 
() from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 +#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so +#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so +#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 +... +``` + +### General Debugging Tips +- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. So, the GDB backtrace will show a path in the runtime, ie inside "__GI_raise" as shown in the example above. +- To determine the true location of the fault, force the kernels to execute synchronously by setting the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before returning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will be the thread which is waiting inside the libhsa-runtime64.so. +- VM faults inside kernels can be caused by: + - incorrect code (ie a for loop which extends past array boundaries), + - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). + - synchronization issues + - compiler issues (incorrect code generation from the compiler) + - runtime issues + +-- General debug tips: +- 'gdb --args' can be used to conveniently pass the executable and arguments to gdb. 
+- From inside GDB, you can set environment variables "set env". Note the command does not use an '=' sign: +``` +(gdb) set env HIP_DB 1 +``` + +#### Print env var state +Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. +Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info. diff --git a/projects/clr/hipamd/docs/markdown/hip_faq.md b/projects/clr/hipamd/docs/markdown/hip_faq.md index e316d449ef..07ec5f1d8b 100644 --- a/projects/clr/hipamd/docs/markdown/hip_faq.md +++ b/projects/clr/hipamd/docs/markdown/hip_faq.md @@ -53,7 +53,7 @@ At a high-level, the following features are not supported: - Dynamic parallelism (CUDA 5.0) - Managed memory (CUDA 6.5) - Graphics interoperability with OpenGL or Direct3D -- CUDA Driver API (Under Development) +- CUDA Driver API - CUDA IPC Functions (Under Development) - CUDA array, mipmappedArray and pitched memory - MemcpyToSymbol functions @@ -102,7 +102,7 @@ However, we can provide a rough summary of the features included in each CUDA SD - Per-thread-streams (under development) - C++11 (HCC supports all of C++11, all of C++14 and some C++17 features) - CUDA 7.5 - - float16 (under development) + - float16 - CUDA 8.0 - TBD. diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index b5c4672464..ef349ef2a5 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -1,4 +1,4 @@ -# Profiling and Debugging HIP Code +# Profiling HIP Code This section describes the profiling and debugging capabilities that HIP provides. Profiling information can viewed in the CodeXL visualization tool or printed directly to stderr as the application runs. 
@@ -280,161 +280,3 @@ None will disable use of color control codes for both the opening and closing an -### Using HIP_DB - -This flag is primarily targeted to assist HIP development team in the development of the HIP runtime, but in some situations may be useful to HIP application developers as well. -The HIP debug information is designed to print important information during the execution of a HIP API. HIP provides -different color-coded levels of debug information: - - api : Print the beginning and end of each HIP API, including the arguments and return codes. This is equivalent to setting HIP_TRACE_API=1. - - sync : Print multi-thread and other synchronization debug information. - - copy : Print which engine is doing the copy, which copy flavor is selected, information on source and destination memory. - - mem : Print information about memory allocation - which pointers are allocated, where they are allocated, peer mappings, and more. - -DB_MEM format is flags separated by '+' sign, or a hex code for the bitmask. Generally the + format is preferred. -For example: -``` -$ HIP_DB=api+copy+mem my-application -$ HIP_DB=0xF my-application -``` - -### Using ltrace -ltrace is a standard linux tool which provides a message to stderr on every dynamic library call. Since ROCr and the ROCt (the ROC thunk, which is the thin user-space interface to the ROC kernel driver) are both dynamic libraries, this provides an easy way to trace the activity in these libraries. Tracing can be a powerful way to quickly observe the flow of the application before diving into the details with a command-line debugger. -The trace can also show performance issues related to accidental calls to expensive API calls on the critical path. - -ltrace can be easily combined with the HIP_DB switches to visualize the runtime behavior of the entire ROCm software stack. Here's a sample command-line and output: - -``` -$ HIP_DB=api ltrace -C -e 'hsa*' - -... 
- -<hsa_signal_store_relaxed(0x1804000, 0, 0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1816000, 0, 0x7f777f85f2a0, 0x400000) = 0 -libmcwamp_hsa.so->hsa_amd_memory_lock(0x7f7776d3e010, 0x400000, 0x1213b70, 1 -libhsa-runtime64.so.1->hsaKmtRegisterMemoryToNodes(0x7f7776d3e010, 0x400000, 1, 0x1220c10) = 0 -libhsa-runtime64.so.1->hsaKmtMapMemoryToGPUNodes(0x7f7776d3e010, 0x400000, 0x7ffc32865400, 64) = 0 -<... hsa_amd_memory_lock resumed> ) = 0 -libmcwamp_hsa.so->hsa_signal_store_relaxed(0x1804000, 1, 0x7f777e95a770, 0x12205b0) = 0 -libmcwamp_hsa.so->hsa_amd_memory_async_copy(0x50411d010, 0x11e70d0, 0x503d1d000, 0x11e70d0) = 0 -libmcwamp_hsa.so->hsa_signal_wait_acquire(0x1804000, 2, 1, -1) = 0 -libmcwamp_hsa.so->hsa_amd_memory_unlock(0x7f7776d3e010, 0x1213c6c, 0x12c3c600000000, 0x1804000 -libhsa-runtime64.so.1->hsaKmtUnmapMemoryToGPU(0x7f7776d3e010, 0x7f7776d3e010, 0x12c3c600000000, 0x1804000) = 0 -libhsa-runtime64.so.1->hsaKmtDeregisterMemory(0x7f7776d3e010, 0x7f7776d3e010, 0x7f777f60f9e8, 0x1220580) = 0 -<... hsa_amd_memory_unlock resumed> ) = 0 - hip-api tid:1.17 hipMemcpy ret= 0 (hipSuccess)>> -``` - -Some key information from the trace above. - - Thy trace snippet shows the execution of a hipMemcpy API, bracketed by the first and last message in the trace output. The messages show the thread id and API sequence number (`1.17`). ltrace output intermixes messages from all threads, so the HIP debug information can be useful to determine which threads are executing. - - The code flows through HIP APIs into ROCr (HSA) APIs (hsa*) and into the thunk (hsaKmt*) calls. - - The HCC runtime is "libmcwamp_hsa.so" and the HSA/ROCr runtime is "libhsa-runtime64.so". - - In this particular case, the memory copy is for unpinned memory, and the selected copy algorithm is to pin the host memory "in-place" before performing the copy. The signaling APIs and calls to pin ("lock", "register") the memory are readily apparent in the trace output. 
- - -### Chicken bits -Chicken bits are environment variables which cause the HIP, HCC, or HSA driver to disable some feature or optimization. -These are not intended for production but can be useful diagnose synchronization problems in the application (or driver). - -Some of the most useful chicken bits are described here. These bits are supported on the ROCm path: - -HIP provides 3 environment variables in the HIP_*_BLOCKING family. These introduce additional synchronization and can be useful to isolate synchronization problems. Specifically, if the code works with this flag set, then it indicates the kernels are executing correctly, and any failures likely are causes by improper or missing synchronization. These flags will have performance impact and are not intended for production use. - -- HIP_LAUNCH_BLOCKING=1 : Waits on the host after each kernel launch. Equivalent to setting CUDA_LAUNCH_BLOCKING. -- HIP_LAUNCH_BLOCKING_KERNELS: A comma-separated list of kernel names. The HIP runtime will wait on the host after one of the named kernels executes. This provides a more targeted version of HIP_LAUNCH_BLOCKING and may be useful to isolate exactly which kernel needs further analysis if HIP_LAUNCH_BLOCKING=1 improves functionality. There is no indication if kernel names are spelled incorrectly. One mechanism to verify that the blocking is working is to run with HIP_DB=api+sync and search for debug messages with "LAUNCH_BLOCKING". -- HIP_API_BLOCKING : Forces hipMemcpyAsync and hipMemsetAsync to be host-synchronous, meaning they will wait for the requested operation to complete before returning to the caller. - -These options cause HCC to serialize. Useful if you have libraries or code which is calling HCC kernels directly rather than using HIP. -- HCC_SERIALZIE_KERNELS : 0x1=pre-serialize before each kernel launch, 0x2=post-serialize after each kernel launch., 0x3= pre- and post- serialize. 
-- HCC_SERIALIZE_COPY : 0x1=pre-serialize before each async copy, 0x2=post-serialize after each async copy., 0x3= pre- and post- serialize. - -- HSA_ENABLE_SDMA=0 : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines. Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine. This flag is useful to isolate issues with the hardware copy engines. -- HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts. Can be useful to diagnose interrupt storm issues in the driver. -- HSA_DISABLE_CACHE=1 : Disables the GPU L2 data cache. - -### Debugging HIP Applications - -- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum)- a monotonically increasing count of the HIP APIs called from this thread. This can be useful for setting conditional breakpoints. Also, each new HIP thread is mapped to monotically increasing shortTid ID. Both of these fields are displayed in the HIP debug info. -``` -(gdb) p tls_tidInfo -$32 = {_shortTid = 1, _apiSeqNum = 803} -``` - -- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc". -If the HCC runtime is built with debug information (HCC_RUNTIME_DEBUG=ON when building HCC), then calling the function 'hc::am_memtracker_print()' will show all memory allocations. -An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output. -This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.. 
-The gdb syntax also supports using the variable name (in this case 'dst'): -``` -(gdb) p dst -$33 = (void *) 0x5ec7e9000 -(gdb) call hc::am_memtracker_print(dst) -TargetAddress:0x5ec7e9000 - 0x504cfc000-0x504cfc00f:: allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) -... --->0x5ec7e9000-0x5f7e28fff:: allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil) - -``` - -To debug an explicit address, cast the address to (void*) : -``` -(gdb) call hc::am_memtracker_print((void*)0x508c7f000) -``` -- Debugging GPUVM fault. -For example: -``` -Memory access fault by GPU node-1 on address 0x5924000. Reason: Page not present or supervisor privilege. - -Program received signal SIGABRT, Aborted. -[Switching to Thread 0x7fffdffb5700 (LWP 14893)] -0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -56 ../nptl/sysdeps/unix/sysv/linux/raise.c: No such file or directory. -(gdb) bt -#0 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 -#1 0x00007ffff205b028 in __GI_abort () at abort.c:89 -#2 0x00007ffff6f960eb in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#3 0x00007ffff6f99ea5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#4 0x00007ffff6f78107 in ?? 
() from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#5 0x00007ffff744f184 in start_thread (arg=0x7fffdffb5700) at pthread_create.c:312 -#6 0x00007ffff211b37d in clone () at ../sysdeps/unix/sysv/linux/x86_64/clone.S:111 -(gdb) info threads - Id Target Id Frame - 4 Thread 0x7fffdd521700 (LWP 14895) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 - 3 Thread 0x7fffddd22700 (LWP 14894) "caffe" pthread_cond_wait@@GLIBC_2.3.2 () at ../nptl/sysdeps/unix/sysv/linux/x86_64/pthread_cond_wait.S:185 -* 2 Thread 0x7fffdffb5700 (LWP 14893) "caffe" 0x00007ffff2057c37 in __GI_raise (sig=sig@entry=6) at ../nptl/sysdeps/unix/sysv/linux/raise.c:56 - 1 Thread 0x7ffff7fa6ac0 (LWP 14892) "caffe" 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) thread 1 -[Switching to thread 1 (Thread 0x7ffff7fa6ac0 (LWP 14892))] -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -(gdb) bt -#0 0x00007ffff6f934d5 in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#1 0x00007ffff6f929ba in ?? () from /opt/rocm/hsa/lib/libhsa-runtime64.so.1 -#2 0x00007fffe080beca in HSADispatch::waitComplete() () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#3 0x00007fffe080415f in HSADispatch::dispatchKernelAsync(Kalmar::HSAQueue*, void const*, int, bool) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#4 0x00007fffe080238e in Kalmar::HSAQueue::dispatch_hsa_kernel(hsa_kernel_dispatch_packet_s const*, void const*, unsigned long, hc::completion_future*) () from /opt/rocm/hcc/lib/libmcwamp_hsa.so -#5 0x00007ffff7bb7559 in hipModuleLaunchKernel () from /opt/rocm/hip/lib/libhip_hcc.so -#6 0x00007ffff2e6cd2c in mlopen::HIPOCKernel::run (this=0x7fffffffb5a8, args=0x7fffffffb2a8, size=80) at /root/MIOpen/src/hipoc/hipoc_kernel.cpp:15 -... -``` - -### General Debugging Tips -- The fault will be caught by the runtime but was actually generated by an asynchronous command running on the GPU. 
So, the GDB backtrace will show a path in the runtime, ie inside "GI_Raise" as shown in the example above. -- To determine the true location of the fault, force the kernels to execute synchronously by seeing the environment variables HCC_SERIALIZE_KERNEL=3 HCC_SERIALIZE_COPY=3. This will force HCC to wait for the kernel to finish executing before retuning. If the fault occurs during the execution of a kernel, you can see the code which launched the kernel inside the backtrace. A bit of guesswork is required to determine which thread is actually causing the issue - typically it will the thread which is waiting inside the libhsa-runtime64.so. -- VM faults inside kernels can be caused byi: - - incorrect code (ie a for loop which extends past array boundaries), i - - memory issues - kernel arguments which are invalid (null pointers, unregistered host pointers, bad pointers). - - synchronization issues - - compiler issues (incorrect code generation from the compiler) - - runtime issues - --- General debug tips: -- 'gdb --args' can be used to conviently pass the executable and arguments to gdb. -- From inside GDB, you can set environment variables "set env". Note the command does not use an '=' sign: -``` -(gdb) set env HIP_DB 1 -``` -Setting HIP_PRINT_ENV=1 and then running a HIP application will print the HIP environment variables, their current values, and usage info. -Setting HCC_PRINT_ENV=1 and then running a HCC application will print the HCC environment variables, their current values, and usage info. 
- - - From db102ab82f49e424a4412b1b0b7c142acbf95002 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:02:31 -0500 Subject: [PATCH 086/171] Update tests README [ROCm/clr commit: 5d2072aba10169f6ad6e631282fdda8fbb44c68f] --- projects/clr/hipamd/tests/README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/clr/hipamd/tests/README.md b/projects/clr/hipamd/tests/README.md index 223bd149dc..cb41cc10cd 100644 --- a/projects/clr/hipamd/tests/README.md +++ b/projects/clr/hipamd/tests/README.md @@ -59,5 +59,9 @@ Find the test and commandline that fail: grep -IR hipMemcpy-modes -IR ../tests/ ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 +# Guidelines for adding new tests +- Prefer to enhance an existing test as opposed to writing a new one. Tests have overhead to start and many small tests spend precious test time on startup and initialization issues. +- Make the test run standalone without requirement for command-line arguments. This makes it easier to debug since the name of the test is shown in the test report and if you know the name of the test you can then run the test. +- For long-running tests or tests with multiple phases, consider using the --tests option as an optional mechanism to allow debuggers to start with the failing subset of the test. From 4ac6ac9d1d66b200468ec2e5324cadd68cebd567 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 12 May 2017 17:04:23 -0500 Subject: [PATCH 087/171] Add initial HIP_SYNC_NULL_STREAM=0 mode. This eliminates host-synchronization for null stream. Instead, the null-stream uses GPU-side events to wait for other streams. Default is OFF pending additional testing. Add enhanced null-stream test. Also refine HIP_TRACE_API. 
[ROCm/clr commit: 8bc6ee5932967e732f41ec327f280e8be67c932e] --- .../include/hip/hcc_detail/hip_runtime_api.h | 7 +- projects/clr/hipamd/src/grid_launch.cpp | 2 +- projects/clr/hipamd/src/hip_device.cpp | 2 +- projects/clr/hipamd/src/hip_event.cpp | 16 +- projects/clr/hipamd/src/hip_hcc.cpp | 145 +++++++++++----- projects/clr/hipamd/src/hip_hcc_internal.h | 12 +- projects/clr/hipamd/src/hip_memory.cpp | 8 +- projects/clr/hipamd/src/hip_module.cpp | 4 +- projects/clr/hipamd/src/hip_stream.cpp | 4 +- .../src/runtimeApi/stream/hipNullStream.cpp | 156 ++++++++++++++---- projects/clr/hipamd/tests/src/test_common.h | 60 ++++++- 11 files changed, 320 insertions(+), 96 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 175fd64d29..e1aecef1e8 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -602,9 +602,12 @@ hipError_t hipStreamQuery(hipStream_t stream); * * @return #hipSuccess, #hipErrorInvalidResourceHandle * - * If the null stream is specified, this command blocks until all + * This command is host-synchronous : the host will block until the specified stream is empty. + * + * This command follows standard null-stream semantics. Specifically, specifying the null stream will cause the + * command to wait for other streams on the same device to complete all pending operations. + * * This command honors the hipDeviceLaunchBlocking flag, which controls whether the wait is active or blocking. - * This command is host-synchronous : the host will block until the stream is empty. 
* * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamWaitEvent, hipStreamDestroy * diff --git a/projects/clr/hipamd/src/grid_launch.cpp b/projects/clr/hipamd/src/grid_launch.cpp index cac01df7dc..ffa50dec95 100644 --- a/projects/clr/hipamd/src/grid_launch.cpp +++ b/projects/clr/hipamd/src/grid_launch.cpp @@ -52,7 +52,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream) { - if ((HIP_TRACE_API & (1 << TRACE_CMD)) || + if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || (COMPILE_HIP_DB && HIP_TRACE_API)) { std::stringstream os; diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 01a213190f..93c1c20484 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -298,7 +298,7 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_t* props, int device) hipError_t hipGetDeviceProperties(hipDeviceProp_t* props, int device) { HIP_INIT_API(props, device); - return ihipGetDeviceProperties(props, device); + return ihipLogStatus(ihipGetDeviceProperties(props, device)); } hipError_t hipSetDeviceFlags( unsigned int flags) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 61ac5cd3ab..fbaf5cc463 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -114,14 +114,17 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) HIP_INIT_API(event, stream); if (event && event->_state != hipEventStatusUnitialized) { + stream = ihipSyncAndResolveStream(stream); + event->_stream = stream; - if (stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream == NULL) { + + // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 + // If stream == NULL, wait on all queues. - // TODO-HCC fix this - is this conservative or still uses device timestamps? - // TODO-HCC can we use barrier or event marker to implement better solution? 
ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); event->_state = hipEventStatusRecorded; @@ -164,9 +167,10 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (event->_stream == NULL) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { auto *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true); + // TODO-HIP_SYNC_NULL_STREAM - can remove this code + ctx->locked_syncDefaultStream(true, true); return ihipLogStatus(hipSuccess); } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 07604fe85d..979a2e5028 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -92,6 +92,9 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; +// Sync on host between +int HIP_SYNC_NULL_STREAM = 1; + int HCC_OPT_FLUSH = 0; @@ -289,6 +292,32 @@ inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCri assert(streamCrit->_hasQueue); } +hc::hcWaitMode ihipStream_t::waitMode() const +{ + hc::hcWaitMode waitMode = hc::hcWaitModeActive; + + if (_scheduleMode == Auto) { + if (g_deviceCnt > g_numLogicalThreads) { + waitMode = hc::hcWaitModeActive; + } else { + waitMode = hc::hcWaitModeBlocked; + } + } else if (_scheduleMode == Spin) { + waitMode = hc::hcWaitModeActive; + } else if (_scheduleMode == Yield) { + waitMode = hc::hcWaitModeBlocked; + } else { + assert(0); // bad wait mode. 
+ } + + if (HIP_WAIT_MODE == 1) { + waitMode = hc::hcWaitModeBlocked; + } else if (HIP_WAIT_MODE == 2) { + waitMode = hc::hcWaitModeActive; + } + + return waitMode; +} //Wait for all kernel and data copy commands in this stream to complete. //This signature should be used in routines that already have locked the stream mutex @@ -296,29 +325,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { if (crit->_hasQueue) { tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - hc::hcWaitMode waitMode = hc::hcWaitModeActive; - if (_scheduleMode == Auto) { - if (g_deviceCnt > g_numLogicalThreads) { - waitMode = hc::hcWaitModeActive; - } else { - waitMode = hc::hcWaitModeBlocked; - } - } else if (_scheduleMode == Spin) { - waitMode = hc::hcWaitModeActive; - } else if (_scheduleMode == Yield) { - waitMode = hc::hcWaitModeBlocked; - } else { - assert(0); // bad wait mode. - } - - if (HIP_WAIT_MODE == 1) { - waitMode = hc::hcWaitModeBlocked; - } else if (HIP_WAIT_MODE == 2) { - waitMode = hc::hcWaitModeActive; - } - - crit->_av.wait(waitMode); + crit->_av.wait(waitMode()); } else { tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); } @@ -337,7 +345,7 @@ void ihipStream_t::locked_wait() }; // Causes current stream to wait for specified event to complete: -// Note this does not require any kind of host serialization. +// Note this does not provide any kind of host serialization. void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); @@ -1061,26 +1069,57 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) // Implement "default" stream syncronization // This waits for all other streams to drain before continuing. // If waitOnSelf is set, this additionally waits for the default stream to empty. 
-void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf) +// In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other +// activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. +void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { LockedAccessor_CtxCrit_t crit(_criticalData); - tprintf(DB_SYNC, "syncDefaultStream\n"); + tprintf(DB_SYNC, "syncDefaultStream \n"); + + // Vector of ops sent to each stream that will complete before ops sent to null stream: + std::vector depOps; for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; - // Don't wait for streams that have "opted-out" of syncing with NULL stream. - // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { + if (HIP_SYNC_NULL_STREAM) { - if (waitOnSelf || (stream != _defaultStream)) { - // TODO-hcc - use blocking or active wait here? - // TODO-sync - cudaDeviceBlockingSync - stream->locked_wait(); + // Don't wait for streams that have "opted-out" of syncing with NULL stream. 
+ // And - don't wait for the NULL stream + if (!(stream->_flags & hipStreamNonBlocking)) { + + if (waitOnSelf || (stream != _defaultStream)) { + stream->locked_wait(); + } + } + } else { + if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); + + // The last marker will provide appropriate visibility: + if (!streamCrit->_av.get_is_empty()) { + depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + } } } } + + + + // Enqueue a barrier to wait on all the barriers we sent above: + if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { + LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); + if (syncHost) { + defaultCf.wait(); // TODO - account for active or blocking here. + } + } + + tprintf(DB_SYNC, " syncDefaultStream depOps=%zu\n", depOps.size()); + } @@ -1267,6 +1306,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_FAIL_SOC, 0, "Fault on Sub-Optimal-Copy, rather than use a slower but functional implementation. Bit 0x1=Fail on async copy with unpinned memory. Bit 0x2=Fail peer copy rather than use staging buffer copy"); READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); + READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); // TODO - review, can we remove this? READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); @@ -1274,7 +1314,7 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. 
This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impact HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { @@ -1415,17 +1455,44 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { - ihipCtx_t *device = ihipGetTlsDefaultCtx(); + ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); #ifndef HIP_API_PER_THREAD_DEFAULT_STREAM - device->locked_syncDefaultStream(false); + ctx->locked_syncDefaultStream(false, false); #endif - return device->_defaultStream; + return ctx->_defaultStream; } else { - // ALl streams have to wait for legacy default stream to be empty: + // All streams have to wait for legacy default stream to be empty: if (!(stream->_flags & hipStreamNonBlocking)) { - tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str()); - stream->getCtx()->_defaultStream->locked_wait(); + if (HIP_SYNC_NULL_STREAM) { + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + stream->getCtx()->_defaultStream->locked_wait(); + } else { + ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; + + tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); + + bool needMarker = false; + hc::completion_future dcf; + { + LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); + // TODO - could call create_blocking_marker(queue) 
+ if (!defaultStreamCrit->_av.get_is_empty()) { + needMarker = true; + + // TODO - add "none_scope". + dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } + } + + if (needMarker) { + // ensure any commands sent to this stream wait on the NULL stream before continuing + LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); + // TODO - could be "noret" version of create_blocking_marker + thisStreamCrit->_av.create_blocking_marker(dcf); + } + } } return stream; diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 7787242ca7..0d080f9225 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -66,6 +66,8 @@ extern int HIP_COHERENT_HOST_ALLOC; // Chicken bits for disabling functionality to work around potential issues: extern int HIP_SYNC_HOST_ALLOC; +extern int HIP_SYNC_NULL_STREAM; + // TODO - remove when this is standard behavior. extern int HCC_OPT_FLUSH; @@ -187,11 +189,11 @@ extern const char *API_COLOR_END; //--- -//HIP Trace modes +//HIP Trace modes - use with HIP_TRACE_API=... #define TRACE_ALL 0 // 0x1 #define TRACE_KCMD 1 // 0x2, kernel command #define TRACE_MCMD 2 // 0x4, memory command -#define TRACE_MEM 3 // 0x8 +#define TRACE_MEM 3 // 0x8, memory allocation or deallocation. //--- @@ -276,7 +278,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); API_TRACE(0, __VA_ARGS__); -// Like above, but will trace with TRACE_CMD. +// Like above, but will trace with a specified "special" bit. // Replace HIP_INIT_API with this call inside HIP APIs that launch work on the GPU: // kernel launches, copy commands, memory sets, etc. #define HIP_INIT_SPECIAL_API(tbit, ...) 
\ @@ -521,8 +523,10 @@ public: void locked_waitEvent(hipEvent_t event); void locked_recordEvent(hipEvent_t event); + ihipStreamCritical_t &criticalData() { return _criticalData; }; //--- + hc::hcWaitMode waitMode() const; // Use this if we already have the stream critical data mutex: void wait(LockedAccessor_StreamCrit_t &crit); @@ -786,7 +790,7 @@ public: // Functions: void locked_removeStream(ihipStream_t *s); void locked_reset(); void locked_waitAllStreams(); - void locked_syncDefaultStream(bool waitOnSelf); + void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); // Will allocate a queue and assign it to the needyStream: hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index cef676b572..5501fec734 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -525,7 +525,7 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t cou hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind); if(symbolName == nullptr) { @@ -598,7 +598,7 @@ hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_ hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName, size_t count, size_t offset, hipMemcpyKind kind, hipStream_t stream) { - HIP_INIT_CMD_API(symbolName, dst, count, offset, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), symbolName, dst, count, offset, kind, stream); if(symbolName == nullptr) { @@ -807,7 +807,7 @@ hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch, size_t width, size_t height, hipMemcpyKind kind, 
hipStream_t stream) { - HIP_INIT_CMD_API(dst, dpitch, src, spitch, width, height, kind, stream); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, dpitch, src, spitch, width, height, kind, stream); if(width > dpitch || width > spitch) return ihipLogStatus(hipErrorUnknown); hipError_t e = hipSuccess; @@ -1041,7 +1041,7 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes ) { - HIP_INIT_CMD_API(dst, value, sizeBytes); + HIP_INIT_SPECIAL_API((TRACE_MCMD), dst, value, sizeBytes); hipError_t e = hipSuccess; diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index b359e7a63c..da01f23769 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -352,14 +352,14 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char *func = sym; hmod->funcTrack.push_back(*func); } - return ihipLogStatus(ret); + return ret; } hipError_t hipModuleGetFunction(hipFunction_t *hfunc, hipModule_t hmod, const char *name){ HIP_INIT_API(hfunc, hmod, name); - return ihipModuleGetSymbol(hfunc, hmod, name); + return ihipLogStatus(ihipModuleGetSymbol(hfunc, hmod, name)); } diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index d7f8717725..34b4bc8851 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -150,7 +150,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; @@ -174,7 +174,7 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/); + 
ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); } else { stream->locked_wait(); e = hipSuccess; diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp index f8d201cb51..380979f6bc 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -27,8 +27,9 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" #include -unsigned p_streams = 6; +unsigned p_streams =16; int p_repeat = 10; +int p_db = 0; template @@ -45,7 +46,7 @@ vectorADDRepeat(hipLaunchParm lp, for (int j=1; j<=repeat;j++) { for (size_t i=offset; i::Streamer(size_t numElements, bool useNullStream) : HIPCHECK(hipStreamCreate(&_stream)); } HIPCHECK(hipEventCreate(&_event)); + + H2D(); + }; +template +void Streamer::H2D() +{ + HIPCHECK(hipMemcpy(_A_d, _A_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(_B_d, _B_h, _numElements*sizeof(T), hipMemcpyHostToDevice)); +} + +template +void Streamer::D2H() +{ + HIPCHECK(hipMemcpy(_C_h, _C_d, _numElements*sizeof(T), hipMemcpyDeviceToHost)); +} + +template +void Streamer::reset() +{ + HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); + H2D(); + +} + + template void Streamer::enqueAsync() { @@ -131,6 +161,10 @@ void parseMyArguments(int argc, char *argv[]) if (++i >= argc || !HipTest::parseUInt(argv[i], &p_streams)) { failed("Bad streams argument"); } + } else if (!strcmp(arg, "--repeat") || (!strcmp(arg, "-r"))) { + if (++i >= argc || !HipTest::parseInt(argv[i], &p_repeat)) { + failed("Bad repeat argument"); + } } else { failed("Bad argument '%s'", arg); } @@ -138,6 +172,15 @@ void parseMyArguments(int argc, char *argv[]) }; +void +printBuffer(std::string name, int *f, size_t numElements) +{ + std::cout << name << "\n"; + for (size_t i=0; i FloatStreamer; + typedef Streamer IntStreamer; - std::vector streamers; + 
std::vector streamers; size_t numElements = N; - float *expected_H = (float*)malloc(numElements*sizeof(float)); + int *expected_H = (int*)malloc(numElements*sizeof(int)); - auto nullStreamer = new FloatStreamer(numElements, true); + auto nullStreamer = new IntStreamer(numElements, true); + + // Expected resultr - last streamer runs vectorADDRepeat, then nullstreamer adds lastStreamer->_C_d + lastStreamer->_C_d for (size_t i=0; i_A_h[i]*p_repeat + nullStreamer->_B_h[i] * p_repeat; + expected_H[i] = ((nullStreamer->_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat) *2; } for (int i=0; i Test 0x1 runAsnc\n"); - for (int i=0; ienqueAsync(); + for (int s=1; s Test %x runAsnc, #streams=%d\n", (1<reset(); + + for (int i=0; ienqueAsync(); + } + + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + + if (p_db) { + HIPCHECK(hipDeviceSynchronize()); + lastStreamer->D2H(); + printBuffer("lastStream _A_h", lastStreamer->_A_h, min(numElements, size_t(20))); + printBuffer("lastStream _B_h", lastStreamer->_B_h, min(numElements, size_t(20))); + printBuffer("lastStream _C_h", lastStreamer->_C_h, min(numElements, size_t(20))); + } + nullStreamer->D2H(); + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - - auto lastStreamer = streamers[p_streams - 1]; - - // Dispatch to NULL stream, should wait for prior async activity to complete. 
- unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); - hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); - HIPCHECK(hipMemcpy(nullStreamer->_C_h, nullStreamer->_C_d, numElements*sizeof(float), hipMemcpyDeviceToHost)); - HIPCHECK(hipStreamSynchronize(0)); - - - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } - if (p_tests & 0x2) { - printf ("==> Test 0x2 runAsnc-odd-only\n"); - for (int i=0; ienqueAsync(); + for (int s=1; sreset(); + printf ("==> Test %x runAsnc-odd-only, #streams=%d\n", tmask, s); + for (int i=0; ienqueAsync(); + } } + auto lastStreamer = streamers[s - 1]; + + // Dispatch to NULL stream, should wait for prior async activity to complete before beginning: + hipLaunchKernel(vectorADDRepeat, dim3(blocks), dim3(threadsPerBlock), 0, 0/*nullstream*/, lastStreamer->_C_d, lastStreamer->_C_d, nullStreamer->_C_d, numElements, 1/*repeat*/); + + nullStreamer->D2H(); + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } + // Expected resultr - last streamer runs vectorADDRepeat + for (size_t i=0; i_A_h[i])*p_repeat + (nullStreamer->_B_h[i]) * p_repeat); + } + + if (p_tests & 0x20000) { + + assert (p_streams >=2); // need a couple streams in order to run this test. + nullStreamer->reset(); + printf ("\n==> Test hipStreamSynchronize with defaultStream \n"); + + // Enqueue a long-running job to stream1 + streamers[0]->enqueAsync(); + + // Check to see if synchronizing on a null stream synchronizes all other streams or just the null stream. + // This function follows null stream semantics and will wait for all other blocking streams before returning. + // This will wait on the host + HIPCHECK(hipStreamSynchronize(0)); + + // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. 
+ HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); + + + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + } + passed(); } diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 633ee6f825..1a6e51e08e 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -184,6 +184,20 @@ addCountReverse( const T *A_d, } +void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) +{ + // Initialize the host data: + for (size_t i=0; i void initArraysForHost(T **A_h, T **B_h, T **C_h, size_t N, bool usePinnedHost=false) @@ -217,15 +231,10 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - // Initialize the host data: - for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, T **A_h, T **B_h, T **C_h, @@ -367,6 +376,43 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } +// Assumes C_h contains vector add of A_h + B_h +// Calls the test "failed" macro if a mismatch is detected. +template +void checkTest(T* expected_H, T* result_H, size_t N, bool expectMatch=true) +{ + size_t mismatchCount = 0; + size_t firstMismatch = 0; + size_t mismatchesToPrint = 10; + for (size_t i=0; i Date: Sat, 13 May 2017 16:00:26 +0000 Subject: [PATCH 088/171] Fix HIP_TRACE_API so kernel launch only printed when requested. 
[ROCm/clr commit: fad9104b5fc34e44f860c13a5e549e251009e875] --- projects/clr/hipamd/src/grid_launch.cpp | 2 +- projects/clr/hipamd/tests/src/test_common.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/grid_launch.cpp b/projects/clr/hipamd/src/grid_launch.cpp index ffa50dec95..f3b28c5f60 100644 --- a/projects/clr/hipamd/src/grid_launch.cpp +++ b/projects/clr/hipamd/src/grid_launch.cpp @@ -54,7 +54,7 @@ namespace hip_impl { if ((HIP_TRACE_API & (1 << TRACE_KCMD)) || HIP_PROFILE_API || - (COMPILE_HIP_DB && HIP_TRACE_API)) { + (COMPILE_HIP_DB && (HIP_TRACE_API & (1< void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { // Initialize the host data: From 33cfd1a35e81fff5def97ad477680a69aec445c1 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 18:56:40 -0500 Subject: [PATCH 089/171] Make hipMultiThreadStreams1 test a little harsher. Fail faster if synchronization rules are violated. Run vectorAddRevers to read last elements of array first - if the vector add kernel starts before preceding copy finishes we will read stale data and flag the error. Increase default array sizes, so synchronization errors more easily exposed. [ROCm/clr commit: 848d77ebb579b59915267bde22d4d727acc40900] --- .../multiThread/hipMultiThreadStreams1.cpp | 45 +++++++++++++++---- projects/clr/hipamd/tests/src/test_common.h | 39 ++++++++++++---- 2 files changed, 66 insertions(+), 18 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/projects/clr/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 229ceea440..4f73b67ad7 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -29,6 +29,8 @@ THE SOFTWARE. 
#include "hip/hip_runtime.h" #include "test_common.h" +int p_iters=10; + void printSep() { printf ("======================================================================================\n"); @@ -43,7 +45,7 @@ template< class P=HipTest::Unpinned, class C=HipTest::Memcpy > -void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) +void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) { using HipTest::MemTraits; @@ -57,6 +59,24 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) T *A_h, *B_h, *C_h; HipTest::initArrays (&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, P::isPinned); + for (size_t i=0; i::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); + MemTraits::Copy(C_d, C_h, Nbytes, hipMemcpyHostToDevice, stream); + HIPCHECK (hipDeviceSynchronize()); + + for (size_t i=0; i::Copy(A_d, A_h, Nbytes, hipMemcpyHostToDevice, stream); MemTraits::Copy(B_d, B_h, Nbytes, hipMemcpyHostToDevice, stream); - hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + //HIPCHECK(hipStreamSynchronize(stream)); + + // This is the null stream? 
+ //hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); + hipLaunchKernel(HipTest::vectorADDReverse, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d, B_d, C_d, numElements); MemTraits::Copy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost, stream); @@ -76,9 +100,9 @@ void simpleVectorCopy(size_t numElements, int iters, hipStream_t stream) } HipTest::freeArrays (A_d, B_d, C_d, A_h, B_h, C_h, P::isPinned); + std::cout <<" pid" << pid << " success\n"; HIPCHECK (hipDeviceSynchronize()); - std::cout <<" pid" << pid << " success\n"; } template @@ -88,12 +112,14 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s printf ("%s\n", __func__); std::cout << testName << std::endl; + size_t numElements = N; + // Test 2 threads operating on same stream: - std::thread t1 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream0); + std::thread t1 (simpleVectorAdd, numElements, p_iters/*iters*/, stream0); if (serialize) { t1.join(); } - std::thread t2 (simpleVectorCopy, 2000000/*mb*/, 100/*iters*/, stream1); + std::thread t2 (simpleVectorAdd, numElements, p_iters/*iters*/, stream1); if (serialize) { t2.join(); } @@ -109,6 +135,7 @@ void test_multiThread_1(std::string testName, hipStream_t stream0, hipStream_t s int main(int argc, char *argv[]) { + N = 8000000; HipTest::parseStandardArguments(argc, argv, true); printf ("info: set device to %d\n", p_gpuDevice); @@ -121,8 +148,8 @@ int main(int argc, char *argv[]) hipStream_t stream; HIPCHECK (hipStreamCreate(&stream)); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); - simpleVectorCopy (2000000/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); + simpleVectorAdd (N/*mb*/, 10/*iters*/, stream); HIPCHECK(hipStreamDestroy(stream)); } @@ -139,8 +166,8 @@ int main(int argc, char *argv[]) } if (p_tests & 0x4) { - test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); - test_multiThread_1 ("Multithread with two 
streams", stream0, stream1, false); + //test_multiThread_1 ("Multithread with NULL stream", NULL, NULL, false); + //test_multiThread_1 ("Multithread with two streams", stream0, stream1, false); test_multiThread_1 ("Multithread with one stream", stream0, stream0, false); } diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 2c6905eea2..bb44c94745 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -146,6 +146,23 @@ vectorADD(hipLaunchParm lp, } +template +__global__ void +vectorADDReverse(hipLaunchParm lp, + const T *A_d, + const T *B_d, + T *C_d, + size_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = A_d[i] + B_d[i]; + } +} + + template __global__ void addCount( const T *A_d, @@ -343,7 +360,7 @@ inline void initHIPArrays(hipArray **A_d, hipArray **B_d, hipArray **C_d, // Assumes C_h contains vector add of A_h + B_h // Calls the test "failed" macro if a mismatch is detected. 
template -void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true) +size_t checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true, bool reportMismatch=true) { size_t mismatchCount = 0; size_t firstMismatch = 0; @@ -364,15 +381,19 @@ void checkVectorADD(T* A_h, T* B_h, T* result_H, size_t N, bool expectMatch=true } } - if (expectMatch) { - if (mismatchCount) { - failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + if (reportMismatch) { + if (expectMatch) { + if (mismatchCount) { + failed("%zu mismatches ; first at index:%zu\n", mismatchCount, firstMismatch); + } + } else { + if (mismatchCount == 0) { + failed("expected mismatches but did not detect any!"); + } } - } else { - if (mismatchCount == 0) { - failed("expected mismatches but did not detect any!"); - } - } + } + + return mismatchCount; } From 0318e91450f53baf62fa1fcc124fbf027b6e53c3 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 16 May 2017 21:35:40 -0500 Subject: [PATCH 090/171] changed vector types to make sure it generate proper llvm vector types Change-Id: I6c4616dae137dc4eac35e5827dc5b7f3251e0247 [ROCm/clr commit: 93fa17490091ddcd04d9dc8e6fa5f2b0fe38eddb] --- .../hipamd/include/hip/hcc_detail/hip_fp16.h | 125 +- .../include/hip/hcc_detail/hip_vector_types.h | 4067 +---------------- projects/clr/hipamd/src/hip_fp16.cpp | 442 +- projects/clr/hipamd/src/hip_hc_gfx803.ll | 147 +- 4 files changed, 266 insertions(+), 4515 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index 0a861b64af..f1f52e4122 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -25,17 +25,6 @@ THE SOFTWARE. 
#include "hip/hcc_detail/hip_vector_types.h" -#if __clang_major__ > 3 - -typedef __fp16 __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - typedef __half half; typedef __half2 half2; @@ -214,10 +203,10 @@ __device__ __half __ushort2half_ru(unsigned short int i); __device__ __half __ushort2half_rz(unsigned short int i); __device__ __half __ushort_as_half(const unsigned short int i); -extern "C" int __hip_hc_ir_hadd2_int(int, int); -extern "C" int __hip_hc_ir_hfma2_int(int, int, int); -extern "C" int __hip_hc_ir_hmul2_int(int, int); -extern "C" int __hip_hc_ir_hsub2_int(int, int); +extern "C" __half2 __hip_hc_ir_hadd2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hfma2_int(__half2, __half2, __half2); +extern "C" __half2 __hip_hc_ir_hmul2_int(__half2, __half2); +extern "C" __half2 __hip_hc_ir_hsub2_int(__half2, __half2); extern "C" __half __hip_hc_ir_hceil_half(__half) __asm("llvm.ceil.f16"); extern "C" __half __hip_hc_ir_hcos_half(__half) __asm("llvm.cos.f16"); @@ -231,16 +220,16 @@ extern "C" __half __hip_hc_ir_hsin_half(__half) __asm("llvm.sin.f16"); extern "C" __half __hip_hc_ir_hsqrt_half(__half) __asm("llvm.sqrt.f16"); extern "C" __half __hip_hc_ir_htrunc_half(__half) __asm("llvm.trunc.f16"); -extern "C" int __hip_hc_ir_h2ceil_int(int); -extern "C" int __hip_hc_ir_h2cos_int(int); -extern "C" int __hip_hc_ir_h2exp2_int(int); -extern "C" int __hip_hc_ir_h2floor_int(int); -extern "C" int __hip_hc_ir_h2log2_int(int); -extern "C" int __hip_hc_ir_h2rcp_int(int); -extern "C" int __hip_hc_ir_h2rsqrt_int(int); -extern "C" int __hip_hc_ir_h2sin_int(int); -extern "C" int __hip_hc_ir_h2sqrt_int(int); -extern "C" int __hip_hc_ir_h2trunc_int(int); +extern "C" __half2 __hip_hc_ir_h2ceil_int(__half2); +extern "C" __half2 __hip_hc_ir_h2cos_int(__half2); +extern "C" __half2 __hip_hc_ir_h2exp2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2floor_int(__half2); +extern "C" __half2 
__hip_hc_ir_h2log2_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rcp_int(__half2); +extern "C" __half2 __hip_hc_ir_h2rsqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sin_int(__half2); +extern "C" __half2 __hip_hc_ir_h2sqrt_int(__half2); +extern "C" __half2 __hip_hc_ir_h2trunc_int(__half2); /* Half2 Arithmetic Functions @@ -248,63 +237,63 @@ extern "C" int __hip_hc_ir_h2trunc_int(int); __device__ static inline __half2 __hadd2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hadd2_int(a.q, b.q); + c.xy = __hip_hc_ir_hadd2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) { __half2 d; - d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q); + d.xy = __hip_hc_ir_hfma2_int(a.xy, b.xy, c.xy); return d; } __device__ static inline __half2 __hmul2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hmul2_int(a.q, b.q); + c.xy = __hip_hc_ir_hmul2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hsub2(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ static inline __half2 __hneg2(__half2 a) { __half2 c; - c.p[0] = - a.p[0]; - c.p[1] = - a.p[1]; + c.x = - a.x; + c.y = - a.y; return c; } __device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) { __half2 c; - c.q = __hip_hc_ir_hsub2_int(a.q, b.q); + c.xy = __hip_hc_ir_hsub2_int(a.xy, b.xy); return c; } __device__ 
static inline __half2 h2div(__half2 a, __half2 b) { __half2 c; - c.p[0] = a.p[0] / b.p[0]; - c.p[1] = a.p[1] / b.p[1]; + c.x = a.x / b.x; + c.y = a.y / b.y; return c; } @@ -375,112 +364,94 @@ Half2 Math Operations __device__ static inline __half2 h2ceil(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2ceil_int(h.q); + a.xy = __hip_hc_ir_h2ceil_int(h.xy); return a; } __device__ static inline __half2 h2cos(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2cos_int(h.q); + a.xy = __hip_hc_ir_h2cos_int(h.xy); return a; } __device__ static inline __half2 h2exp(const __half2 h) { __half2 factor; - factor.p[0] = 1.442694; - factor.p[1] = 1.442694; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 1.442694; + factor.y = 1.442694; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp10(const __half2 h) { __half2 factor; - factor.p[0] = 3.3219281; - factor.p[1] = 3.3219281; - factor.q = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.q, factor.q)); + factor.x = 3.3219281; + factor.y = 3.3219281; + factor.xy = __hip_hc_ir_h2exp2_int(__hip_hc_ir_hmul2_int(h.xy, factor.xy)); return factor; } __device__ static inline __half2 h2exp2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2exp2_int(h.q); + a.xy = __hip_hc_ir_h2exp2_int(h.xy); return a; } __device__ static inline __half2 h2floor(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2floor_int(h.q); + a.xy = __hip_hc_ir_h2floor_int(h.xy); return a; } __device__ static inline __half2 h2log(const __half2 h) { __half2 factor; - factor.p[0] = 0.693147; - factor.p[1] = 0.693147; - factor. 
q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.693147; + factor.y = 0.693147; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log10(const __half2 h) { __half2 factor; - factor.p[0] = 0.301029; - factor.p[1] = 0.301029; - factor.q = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.q), factor.q); + factor.x = 0.301029; + factor.y = 0.301029; + factor.xy = __hip_hc_ir_hmul2_int(__hip_hc_ir_h2log2_int(h.xy), factor.xy); return factor; } __device__ static inline __half2 h2log2(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2log2_int(h.q); + a.xy = __hip_hc_ir_h2log2_int(h.xy); return a; } __device__ static inline __half2 h2rcp(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rcp_int(h.q); + a.xy = __hip_hc_ir_h2rcp_int(h.xy); return a; } __device__ static inline __half2 h2rsqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2rsqrt_int(h.q); + a.xy = __hip_hc_ir_h2rsqrt_int(h.xy); return a; } __device__ static inline __half2 h2sin(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sin_int(h.q); + a.xy = __hip_hc_ir_h2sin_int(h.xy); return a; } __device__ static inline __half2 h2sqrt(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2sqrt_int(h.q); + a.xy = __hip_hc_ir_h2sqrt_int(h.xy); return a; } __device__ static inline __half2 h2trunc(const __half2 h) { __half2 a; - a.q = __hip_hc_ir_h2trunc_int(h.q); + a.xy = __hip_hc_ir_h2trunc_int(h.xy); return a; } -#endif - -#if __clang_major__ == 3 - -typedef struct { - unsigned x: 16; -} __half; - -typedef struct __attribute__((aligned(4))){ - union { - __half p[2]; - unsigned int q; - }; -} __half2; - - -#endif - #endif diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 35c6c23548..251da504ab 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ 
b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -34,1120 +34,93 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } - -#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ -__device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - - -#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val) {} \ - -#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} - -#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ -__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} - -struct uchar1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) - - #endif - unsigned char x; - -} __attribute__((aligned(1))); - -struct uchar2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) - #endif - union { - struct { - unsigned char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct uchar3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned 
char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) - #endif - unsigned char x, y, z; -}; - -struct uchar4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) - #endif - union { - struct { - unsigned char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - -struct char1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) - #endif - signed char x; -} __attribute__((aligned(1))); - -struct char2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) - #endif - union { - struct { - signed char x, y; - }; - unsigned short a; - }; -} __attribute__((aligned(2))); - -struct char3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) - #endif - signed char x, y, z; -}; - -struct char4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) - #endif - union { - struct { - signed char x, y, z, w; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - - - -struct ushort1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) - #endif - unsigned short x; -} __attribute__((aligned(2))); - -struct ushort2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) - #endif - union { - struct { - unsigned short x, y; - }; - unsigned int a; - }; -} __attribute__((aligned(4))); - -struct ushort3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) - #endif - unsigned short x, y, z; -}; - -struct ushort4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) - #endif - union { - struct { - unsigned short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - -struct short1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) - #endif - signed short x; -} __attribute__((aligned(2))); - -struct short2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) - #endif - union { - struct { - signed short x, y; - }; - unsigned int a; - }; - -} __attribute__((aligned(4))); - -struct short3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed 
short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) - #endif - signed short x, y, z; -}; - -struct short4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) - #endif - union { - struct { - signed short x, y, z, w; - }; - unsigned int a, b; - }; -} __attribute__((aligned(8))); - - -struct uint1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, 
unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) - #endif - unsigned int x; -} __attribute__((aligned(4))); - -struct uint2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) - #endif - unsigned int x, y; -} __attribute__((aligned(8))); - -struct uint3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) - #endif - unsigned int x, y, z; -}; - -struct uint4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) - #endif - unsigned int x, y, z, w; -} __attribute__((aligned(16))); - -struct int1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) - 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) - #endif - signed int x; -} __attribute__((aligned(4))); - -struct int2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) - #endif - signed int x, y; -} __attribute__((aligned(8))); - -struct int3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) - 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) - #endif - signed int x, y, z; -}; - -struct int4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) - #endif - signed int x, y, z, w; -} __attribute__((aligned(16))); - - -struct float1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) - #endif - float x; -} __attribute__((aligned(4))); - -struct float2 { - #ifdef 
__cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) - #endif - float x, y; -} __attribute__((aligned(8))); - -struct float3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) - #endif - float x, y, z; -}; - -struct float4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, 
unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) - #endif - float x, y, z, w; -} __attribute__((aligned(16))); - - - -struct double1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) - #endif - double x; -} __attribute__((aligned(8))); - -struct double2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) - #endif - double x, y; -} __attribute__((aligned(16))); - -struct double3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) - #endif - double x, y, z; -}; - -struct double4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) - #endif - double x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) - #endif - unsigned long x; -} __attribute__((aligned(8))); - -struct ulong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) - #endif - unsigned long x, y; -} __attribute__((aligned(16))); - -struct ulong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) - #endif - unsigned long x, y, z; -}; - -struct ulong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) - #endif - unsigned long x, y, z, w; -} __attribute__((aligned(32))); - - -struct long1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) - #endif - signed long x; -} __attribute__((aligned(8))); - -struct long2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) - #endif - signed long x, y; -} __attribute__((aligned(16))); - -struct long3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) - #endif - signed long x, y, z; -}; - -struct long4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); - - -struct ulonglong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long) - #endif - unsigned long long x; -} __attribute__((aligned(8))); - -struct ulonglong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long) - 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long) - #endif - unsigned long long x, y; -} __attribute__((aligned(16))); - -struct ulonglong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long) - #endif - unsigned long long x, y, z; -}; - -struct ulonglong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, 
unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long) - #endif - unsigned long long x, y, z, w; -} __attribute__((aligned(32))); - - -struct longlong1 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1) - - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long) - #endif - signed long long x; -} __attribute__((aligned(8))); - -struct longlong2 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2) - - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned 
long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long) - #endif - signed long long x, y; -} __attribute__((aligned(16))); - -struct longlong3 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3) - - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long) - #endif - signed long long x, y, z; -}; - -struct longlong4 { - #ifdef __cplusplus - public: - MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4) - - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long) - 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long) - MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long) - #endif - signed long x, y, z, w; -} __attribute__((aligned(32))); +#if __cplusplus + +typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); +typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); +typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); +typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); + +typedef signed char char1 __attribute__((ext_vector_type(1))); +typedef signed char char2 __attribute__((ext_vector_type(2))); +typedef signed char char3 __attribute__((ext_vector_type(3))); +typedef signed char char4 __attribute__((ext_vector_type(4))); + +typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); +typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); +typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); +typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); + +typedef signed short short1 __attribute__((ext_vector_type(1))); +typedef signed short short2 __attribute__((ext_vector_type(2))); +typedef signed short short3 __attribute__((ext_vector_type(3))); +typedef signed short short4 __attribute__((ext_vector_type(4))); + +typedef __fp16 __half; + +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 __half3 __attribute__((ext_vector_type(3))); +typedef __fp16 __half4 __attribute__((ext_vector_type(4))); + +typedef unsigned int uint1 __attribute__((ext_vector_type(1))); +typedef unsigned int uint2 __attribute__((ext_vector_type(2))); +typedef unsigned int uint3 __attribute__((ext_vector_type(3))); +typedef unsigned int uint4 __attribute__((ext_vector_type(4))); + +typedef signed int int1 __attribute__((ext_vector_type(1))); +typedef signed int int2 
__attribute__((ext_vector_type(2))); +typedef signed int int3 __attribute__((ext_vector_type(3))); +typedef signed int int4 __attribute__((ext_vector_type(4))); + +typedef float float1 __attribute__((ext_vector_type(1))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float3 __attribute__((ext_vector_type(3))); +typedef float float4 __attribute__((ext_vector_type(4))); + +typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); +typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); +typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); +typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + +typedef signed long long1 __attribute__((ext_vector_type(1))); +typedef signed long long2 __attribute__((ext_vector_type(2))); +typedef signed long long3 __attribute__((ext_vector_type(3))); +typedef signed long long4 __attribute__((ext_vector_type(4))); + +typedef double double1 __attribute__((ext_vector_type(1))); +typedef double double2 __attribute__((ext_vector_type(2))); +typedef double double3 __attribute__((ext_vector_type(3))); +typedef double double4 __attribute__((ext_vector_type(4))); + +typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); +typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); +typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); +typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + +typedef signed long long longlong1 __attribute__((ext_vector_type(1))); +typedef signed long long longlong2 __attribute__((ext_vector_type(2))); +typedef signed long long longlong3 __attribute__((ext_vector_type(3))); +typedef signed long long longlong4 __attribute__((ext_vector_type(4))); #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x) { \ + type ret; \ ret.x = 
x; \ return ret; \ } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y) { \ + type ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1155,8 +128,8 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ -__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \ - struct type ret; \ +__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \ + type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -1164,6 +137,7 @@ __device__ __host__ static inline struct type make_##type(comp x, comp y, comp z return ret; \ } + DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); @@ -1225,2894 +199,9 @@ DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3); DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4); -#if __cplusplus - -#define DECLOP_1VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_ASSIGN(type, op) \ 
-__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - return lhs; \ -} - -#define DECLOP_1VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - return val; \ -} - -#define DECLOP_1VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - val.x op; \ - return ret; \ -} - -#define DECLOP_1VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return lhs.x op rhs.x; \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return lhs.x op rhs.x ; \ -} - -#define DECLOP_1VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type& rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - return ret; \ -} - -#define DECLOP_1VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type& rhs) { \ - return op rhs.x; \ -} - -/* - Two Element Access -*/ - -#define DECLOP_2VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& 
operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - return lhs; \ -} - -#define DECLOP_2VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - return val; \ -} - -#define DECLOP_2VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - val.x op; \ - val.y op; \ - return ret; \ -} - -#define DECLOP_2VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y); \ -} - -#define DECLOP_2VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - return ret; \ -} - -#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y); \ -} - - -/* - Three Element Access -*/ - -#define DECLOP_3VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * 
(type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - return lhs; \ -} - -#define DECLOP_3VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - return val; \ -} - -#define DECLOP_3VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - val.x op; \ - val.y op; \ - val.z op; \ - return ret; \ -} - -#define DECLOP_3VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \ -} \ - -#define DECLOP_3VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - return ret; \ -} - -#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z); \ -} - - -/* - Four Element Access -*/ - -#define DECLOP_4VAR_2IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \ - type 
ret; \ - ret.x = lhs.x op rhs.x; \ - ret.y = lhs.y op rhs.y; \ - ret.z = lhs.z op rhs.z; \ - ret.w = lhs.w op rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \ -__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \ - type ret; \ - ret.x = lhs.x * rhs; \ - ret.y = lhs.y * rhs; \ - ret.z = lhs.z * rhs; \ - ret.w = lhs.w * rhs; \ - return ret; \ -} \ -\ -__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \ - type ret; \ - ret.x = lhs * rhs.x; \ - ret.y = lhs * rhs.y; \ - ret.z = lhs * rhs.z; \ - ret.w = lhs * rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_ASSIGN(type, op) \ -__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \ - lhs.x op rhs.x; \ - lhs.y op rhs.y; \ - lhs.z op rhs.z; \ - lhs.w op rhs.w; \ - return lhs; \ -} - -#define DECLOP_4VAR_PREOP(type, op) \ -__device__ __host__ static inline type& operator op (type& val) { \ - op val.x; \ - op val.y; \ - op val.z; \ - op val.w; \ - return val; \ -} - -#define DECLOP_4VAR_POSTOP(type, op) \ -__device__ __host__ static inline type operator op (type& val, int) { \ - type ret; \ - ret.x = val.x; \ - ret.y = val.y; \ - ret.z = val.z; \ - ret.w = val.w; \ - val.x op; \ - val.y op; \ - val.z op; \ - val.w op; \ - return ret; \ -} - -#define DECLOP_4VAR_COMP(type, op) \ -__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \ - return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} \ -__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \ - return (lhs.x 
op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \ -} - -#define DECLOP_4VAR_1IN_1OUT(type, op) \ -__device__ __host__ static inline type operator op(type &rhs) { \ - type ret; \ - ret.x = op rhs.x; \ - ret.y = op rhs.y; \ - ret.z = op rhs.z; \ - ret.w = op rhs.w; \ - return ret; \ -} - -#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \ -__device__ __host__ static inline bool operator op (type &rhs) { \ - return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \ -} - - -/* -Overloading operators -*/ - -// UNSIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(uchar1, +) -DECLOP_1VAR_2IN_1OUT(uchar1, -) -DECLOP_1VAR_2IN_1OUT(uchar1, *) -DECLOP_1VAR_2IN_1OUT(uchar1, /) -DECLOP_1VAR_2IN_1OUT(uchar1, %) -DECLOP_1VAR_2IN_1OUT(uchar1, &) -DECLOP_1VAR_2IN_1OUT(uchar1, |) -DECLOP_1VAR_2IN_1OUT(uchar1, ^) -DECLOP_1VAR_2IN_1OUT(uchar1, <<) -DECLOP_1VAR_2IN_1OUT(uchar1, >>) - - -DECLOP_1VAR_ASSIGN(uchar1, +=) -DECLOP_1VAR_ASSIGN(uchar1, -=) -DECLOP_1VAR_ASSIGN(uchar1, *=) -DECLOP_1VAR_ASSIGN(uchar1, /=) -DECLOP_1VAR_ASSIGN(uchar1, %=) -DECLOP_1VAR_ASSIGN(uchar1, &=) -DECLOP_1VAR_ASSIGN(uchar1, |=) -DECLOP_1VAR_ASSIGN(uchar1, ^=) -DECLOP_1VAR_ASSIGN(uchar1, <<=) -DECLOP_1VAR_ASSIGN(uchar1, >>=) - -DECLOP_1VAR_PREOP(uchar1, ++) -DECLOP_1VAR_PREOP(uchar1, --) - -DECLOP_1VAR_POSTOP(uchar1, ++) -DECLOP_1VAR_POSTOP(uchar1, --) - -DECLOP_1VAR_COMP(uchar1, ==) -DECLOP_1VAR_COMP(uchar1, !=) -DECLOP_1VAR_COMP(uchar1, <) -DECLOP_1VAR_COMP(uchar1, >) -DECLOP_1VAR_COMP(uchar1, <=) -DECLOP_1VAR_COMP(uchar1, >=) - -DECLOP_1VAR_COMP(uchar1, &&) -DECLOP_1VAR_COMP(uchar1, ||) - -DECLOP_1VAR_1IN_1OUT(uchar1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uchar1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, float) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, double) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long) - -// UNSIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(uchar2, +) -DECLOP_2VAR_2IN_1OUT(uchar2, -) -DECLOP_2VAR_2IN_1OUT(uchar2, *) -DECLOP_2VAR_2IN_1OUT(uchar2, /) -DECLOP_2VAR_2IN_1OUT(uchar2, %) -DECLOP_2VAR_2IN_1OUT(uchar2, &) -DECLOP_2VAR_2IN_1OUT(uchar2, |) -DECLOP_2VAR_2IN_1OUT(uchar2, ^) -DECLOP_2VAR_2IN_1OUT(uchar2, <<) -DECLOP_2VAR_2IN_1OUT(uchar2, >>) - -DECLOP_2VAR_ASSIGN(uchar2, +=) -DECLOP_2VAR_ASSIGN(uchar2, -=) -DECLOP_2VAR_ASSIGN(uchar2, *=) -DECLOP_2VAR_ASSIGN(uchar2, /=) -DECLOP_2VAR_ASSIGN(uchar2, %=) -DECLOP_2VAR_ASSIGN(uchar2, &=) -DECLOP_2VAR_ASSIGN(uchar2, |=) -DECLOP_2VAR_ASSIGN(uchar2, ^=) -DECLOP_2VAR_ASSIGN(uchar2, <<=) -DECLOP_2VAR_ASSIGN(uchar2, >>=) - -DECLOP_2VAR_PREOP(uchar2, ++) -DECLOP_2VAR_PREOP(uchar2, --) - -DECLOP_2VAR_POSTOP(uchar2, ++) -DECLOP_2VAR_POSTOP(uchar2, --) - -DECLOP_2VAR_COMP(uchar2, ==) -DECLOP_2VAR_COMP(uchar2, !=) -DECLOP_2VAR_COMP(uchar2, <) -DECLOP_2VAR_COMP(uchar2, >) -DECLOP_2VAR_COMP(uchar2, <=) -DECLOP_2VAR_COMP(uchar2, >=) - -DECLOP_2VAR_COMP(uchar2, &&) -DECLOP_2VAR_COMP(uchar2, ||) - -DECLOP_2VAR_1IN_1OUT(uchar2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uchar2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, float) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, double) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long) - -// UNSIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(uchar3, +) -DECLOP_3VAR_2IN_1OUT(uchar3, -) -DECLOP_3VAR_2IN_1OUT(uchar3, *) -DECLOP_3VAR_2IN_1OUT(uchar3, /) -DECLOP_3VAR_2IN_1OUT(uchar3, %) -DECLOP_3VAR_2IN_1OUT(uchar3, &) -DECLOP_3VAR_2IN_1OUT(uchar3, |) -DECLOP_3VAR_2IN_1OUT(uchar3, ^) -DECLOP_3VAR_2IN_1OUT(uchar3, <<) -DECLOP_3VAR_2IN_1OUT(uchar3, >>) - -DECLOP_3VAR_ASSIGN(uchar3, +=) -DECLOP_3VAR_ASSIGN(uchar3, -=) -DECLOP_3VAR_ASSIGN(uchar3, *=) -DECLOP_3VAR_ASSIGN(uchar3, /=) -DECLOP_3VAR_ASSIGN(uchar3, %=) -DECLOP_3VAR_ASSIGN(uchar3, &=) -DECLOP_3VAR_ASSIGN(uchar3, |=) -DECLOP_3VAR_ASSIGN(uchar3, ^=) -DECLOP_3VAR_ASSIGN(uchar3, <<=) -DECLOP_3VAR_ASSIGN(uchar3, >>=) - -DECLOP_3VAR_PREOP(uchar3, ++) -DECLOP_3VAR_PREOP(uchar3, --) - -DECLOP_3VAR_POSTOP(uchar3, ++) -DECLOP_3VAR_POSTOP(uchar3, --) - -DECLOP_3VAR_COMP(uchar3, ==) -DECLOP_3VAR_COMP(uchar3, !=) -DECLOP_3VAR_COMP(uchar3, <) -DECLOP_3VAR_COMP(uchar3, >) -DECLOP_3VAR_COMP(uchar3, <=) -DECLOP_3VAR_COMP(uchar3, >=) - -DECLOP_3VAR_COMP(uchar3, &&) -DECLOP_3VAR_COMP(uchar3, ||) - -DECLOP_3VAR_1IN_1OUT(uchar3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uchar3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, float) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, double) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long) - -// UNSIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(uchar4, +) -DECLOP_4VAR_2IN_1OUT(uchar4, -) -DECLOP_4VAR_2IN_1OUT(uchar4, *) -DECLOP_4VAR_2IN_1OUT(uchar4, /) -DECLOP_4VAR_2IN_1OUT(uchar4, %) -DECLOP_4VAR_2IN_1OUT(uchar4, &) -DECLOP_4VAR_2IN_1OUT(uchar4, |) -DECLOP_4VAR_2IN_1OUT(uchar4, ^) -DECLOP_4VAR_2IN_1OUT(uchar4, <<) -DECLOP_4VAR_2IN_1OUT(uchar4, >>) - -DECLOP_4VAR_ASSIGN(uchar4, +=) -DECLOP_4VAR_ASSIGN(uchar4, -=) -DECLOP_4VAR_ASSIGN(uchar4, *=) -DECLOP_4VAR_ASSIGN(uchar4, /=) -DECLOP_4VAR_ASSIGN(uchar4, %=) -DECLOP_4VAR_ASSIGN(uchar4, &=) -DECLOP_4VAR_ASSIGN(uchar4, |=) -DECLOP_4VAR_ASSIGN(uchar4, ^=) -DECLOP_4VAR_ASSIGN(uchar4, <<=) -DECLOP_4VAR_ASSIGN(uchar4, >>=) - -DECLOP_4VAR_PREOP(uchar4, ++) -DECLOP_4VAR_PREOP(uchar4, --) - -DECLOP_4VAR_POSTOP(uchar4, ++) -DECLOP_4VAR_POSTOP(uchar4, --) - -DECLOP_4VAR_COMP(uchar4, ==) -DECLOP_4VAR_COMP(uchar4, !=) -DECLOP_4VAR_COMP(uchar4, <) -DECLOP_4VAR_COMP(uchar4, >) -DECLOP_4VAR_COMP(uchar4, <=) -DECLOP_4VAR_COMP(uchar4, >=) - -DECLOP_4VAR_COMP(uchar4, &&) -DECLOP_4VAR_COMP(uchar4, ||) - -DECLOP_4VAR_1IN_1OUT(uchar4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uchar4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) - -// SIGNED CHAR1 - -DECLOP_1VAR_2IN_1OUT(char1, +) -DECLOP_1VAR_2IN_1OUT(char1, -) -DECLOP_1VAR_2IN_1OUT(char1, *) -DECLOP_1VAR_2IN_1OUT(char1, /) -DECLOP_1VAR_2IN_1OUT(char1, %) -DECLOP_1VAR_2IN_1OUT(char1, &) -DECLOP_1VAR_2IN_1OUT(char1, |) -DECLOP_1VAR_2IN_1OUT(char1, ^) -DECLOP_1VAR_2IN_1OUT(char1, <<) -DECLOP_1VAR_2IN_1OUT(char1, >>) - - -DECLOP_1VAR_ASSIGN(char1, +=) -DECLOP_1VAR_ASSIGN(char1, -=) -DECLOP_1VAR_ASSIGN(char1, *=) -DECLOP_1VAR_ASSIGN(char1, /=) -DECLOP_1VAR_ASSIGN(char1, %=) -DECLOP_1VAR_ASSIGN(char1, &=) -DECLOP_1VAR_ASSIGN(char1, |=) -DECLOP_1VAR_ASSIGN(char1, ^=) -DECLOP_1VAR_ASSIGN(char1, <<=) -DECLOP_1VAR_ASSIGN(char1, >>=) - -DECLOP_1VAR_PREOP(char1, ++) -DECLOP_1VAR_PREOP(char1, --) - -DECLOP_1VAR_POSTOP(char1, ++) -DECLOP_1VAR_POSTOP(char1, --) - -DECLOP_1VAR_COMP(char1, ==) -DECLOP_1VAR_COMP(char1, !=) -DECLOP_1VAR_COMP(char1, <) -DECLOP_1VAR_COMP(char1, >) -DECLOP_1VAR_COMP(char1, <=) -DECLOP_1VAR_COMP(char1, >=) - -DECLOP_1VAR_COMP(char1, &&) -DECLOP_1VAR_COMP(char1, ||) - -DECLOP_1VAR_1IN_1OUT(char1, ~) -DECLOP_1VAR_1IN_BOOLOUT(char1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(char1, float) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(char1, double) -DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) - -// SIGNED CHAR2 - -DECLOP_2VAR_2IN_1OUT(char2, +) -DECLOP_2VAR_2IN_1OUT(char2, -) -DECLOP_2VAR_2IN_1OUT(char2, *) -DECLOP_2VAR_2IN_1OUT(char2, /) -DECLOP_2VAR_2IN_1OUT(char2, %) -DECLOP_2VAR_2IN_1OUT(char2, &) -DECLOP_2VAR_2IN_1OUT(char2, |) -DECLOP_2VAR_2IN_1OUT(char2, ^) -DECLOP_2VAR_2IN_1OUT(char2, <<) -DECLOP_2VAR_2IN_1OUT(char2, >>) - -DECLOP_2VAR_ASSIGN(char2, +=) -DECLOP_2VAR_ASSIGN(char2, -=) -DECLOP_2VAR_ASSIGN(char2, *=) -DECLOP_2VAR_ASSIGN(char2, /=) -DECLOP_2VAR_ASSIGN(char2, %=) -DECLOP_2VAR_ASSIGN(char2, &=) -DECLOP_2VAR_ASSIGN(char2, |=) -DECLOP_2VAR_ASSIGN(char2, ^=) -DECLOP_2VAR_ASSIGN(char2, <<=) -DECLOP_2VAR_ASSIGN(char2, >>=) - -DECLOP_2VAR_PREOP(char2, ++) -DECLOP_2VAR_PREOP(char2, --) - -DECLOP_2VAR_POSTOP(char2, ++) -DECLOP_2VAR_POSTOP(char2, --) - -DECLOP_2VAR_COMP(char2, ==) -DECLOP_2VAR_COMP(char2, !=) -DECLOP_2VAR_COMP(char2, <) -DECLOP_2VAR_COMP(char2, >) -DECLOP_2VAR_COMP(char2, <=) -DECLOP_2VAR_COMP(char2, >=) - -DECLOP_2VAR_COMP(char2, &&) -DECLOP_2VAR_COMP(char2, ||) - -DECLOP_2VAR_1IN_1OUT(char2, ~) -DECLOP_2VAR_1IN_BOOLOUT(char2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(char2, float) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(char2, double) -DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) - -// SIGNED CHAR3 - -DECLOP_3VAR_2IN_1OUT(char3, +) -DECLOP_3VAR_2IN_1OUT(char3, -) -DECLOP_3VAR_2IN_1OUT(char3, *) -DECLOP_3VAR_2IN_1OUT(char3, /) -DECLOP_3VAR_2IN_1OUT(char3, %) -DECLOP_3VAR_2IN_1OUT(char3, &) -DECLOP_3VAR_2IN_1OUT(char3, |) -DECLOP_3VAR_2IN_1OUT(char3, ^) -DECLOP_3VAR_2IN_1OUT(char3, <<) -DECLOP_3VAR_2IN_1OUT(char3, >>) - -DECLOP_3VAR_ASSIGN(char3, +=) -DECLOP_3VAR_ASSIGN(char3, -=) -DECLOP_3VAR_ASSIGN(char3, *=) -DECLOP_3VAR_ASSIGN(char3, /=) -DECLOP_3VAR_ASSIGN(char3, %=) -DECLOP_3VAR_ASSIGN(char3, &=) -DECLOP_3VAR_ASSIGN(char3, |=) -DECLOP_3VAR_ASSIGN(char3, ^=) -DECLOP_3VAR_ASSIGN(char3, <<=) -DECLOP_3VAR_ASSIGN(char3, >>=) - -DECLOP_3VAR_PREOP(char3, ++) -DECLOP_3VAR_PREOP(char3, --) - -DECLOP_3VAR_POSTOP(char3, ++) -DECLOP_3VAR_POSTOP(char3, --) - -DECLOP_3VAR_COMP(char3, ==) -DECLOP_3VAR_COMP(char3, !=) -DECLOP_3VAR_COMP(char3, <) -DECLOP_3VAR_COMP(char3, >) -DECLOP_3VAR_COMP(char3, <=) -DECLOP_3VAR_COMP(char3, >=) - -DECLOP_3VAR_COMP(char3, &&) -DECLOP_3VAR_COMP(char3, ||) - -DECLOP_3VAR_1IN_1OUT(char3, ~) -DECLOP_3VAR_1IN_BOOLOUT(char3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(char3, float) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(char3, double) -DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) - -// SIGNED CHAR4 - -DECLOP_4VAR_2IN_1OUT(char4, +) -DECLOP_4VAR_2IN_1OUT(char4, -) -DECLOP_4VAR_2IN_1OUT(char4, *) -DECLOP_4VAR_2IN_1OUT(char4, /) -DECLOP_4VAR_2IN_1OUT(char4, %) -DECLOP_4VAR_2IN_1OUT(char4, &) -DECLOP_4VAR_2IN_1OUT(char4, |) -DECLOP_4VAR_2IN_1OUT(char4, ^) -DECLOP_4VAR_2IN_1OUT(char4, <<) -DECLOP_4VAR_2IN_1OUT(char4, >>) - -DECLOP_4VAR_ASSIGN(char4, +=) -DECLOP_4VAR_ASSIGN(char4, -=) -DECLOP_4VAR_ASSIGN(char4, *=) -DECLOP_4VAR_ASSIGN(char4, /=) -DECLOP_4VAR_ASSIGN(char4, %=) -DECLOP_4VAR_ASSIGN(char4, &=) -DECLOP_4VAR_ASSIGN(char4, |=) -DECLOP_4VAR_ASSIGN(char4, ^=) -DECLOP_4VAR_ASSIGN(char4, <<=) -DECLOP_4VAR_ASSIGN(char4, >>=) - -DECLOP_4VAR_PREOP(char4, ++) -DECLOP_4VAR_PREOP(char4, --) - -DECLOP_4VAR_POSTOP(char4, ++) -DECLOP_4VAR_POSTOP(char4, --) - -DECLOP_4VAR_COMP(char4, ==) -DECLOP_4VAR_COMP(char4, !=) -DECLOP_4VAR_COMP(char4, <) -DECLOP_4VAR_COMP(char4, >) -DECLOP_4VAR_COMP(char4, <=) -DECLOP_4VAR_COMP(char4, >=) - -DECLOP_4VAR_COMP(char4, &&) -DECLOP_4VAR_COMP(char4, ||) - -DECLOP_4VAR_1IN_1OUT(char4, ~) -DECLOP_4VAR_1IN_BOOLOUT(char4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(char4, float) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(char4, double) -DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) - -// UNSIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(ushort1, +) -DECLOP_1VAR_2IN_1OUT(ushort1, -) -DECLOP_1VAR_2IN_1OUT(ushort1, *) -DECLOP_1VAR_2IN_1OUT(ushort1, /) -DECLOP_1VAR_2IN_1OUT(ushort1, %) -DECLOP_1VAR_2IN_1OUT(ushort1, &) -DECLOP_1VAR_2IN_1OUT(ushort1, |) -DECLOP_1VAR_2IN_1OUT(ushort1, ^) -DECLOP_1VAR_2IN_1OUT(ushort1, <<) -DECLOP_1VAR_2IN_1OUT(ushort1, >>) - - -DECLOP_1VAR_ASSIGN(ushort1, +=) -DECLOP_1VAR_ASSIGN(ushort1, -=) -DECLOP_1VAR_ASSIGN(ushort1, *=) -DECLOP_1VAR_ASSIGN(ushort1, /=) -DECLOP_1VAR_ASSIGN(ushort1, %=) -DECLOP_1VAR_ASSIGN(ushort1, &=) -DECLOP_1VAR_ASSIGN(ushort1, |=) -DECLOP_1VAR_ASSIGN(ushort1, ^=) -DECLOP_1VAR_ASSIGN(ushort1, <<=) -DECLOP_1VAR_ASSIGN(ushort1, >>=) - -DECLOP_1VAR_PREOP(ushort1, ++) -DECLOP_1VAR_PREOP(ushort1, --) - -DECLOP_1VAR_POSTOP(ushort1, ++) -DECLOP_1VAR_POSTOP(ushort1, --) - -DECLOP_1VAR_COMP(ushort1, ==) -DECLOP_1VAR_COMP(ushort1, !=) -DECLOP_1VAR_COMP(ushort1, <) -DECLOP_1VAR_COMP(ushort1, >) -DECLOP_1VAR_COMP(ushort1, <=) -DECLOP_1VAR_COMP(ushort1, >=) - -DECLOP_1VAR_COMP(ushort1, &&) -DECLOP_1VAR_COMP(ushort1, ||) - -DECLOP_1VAR_1IN_1OUT(ushort1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) - -// UNSIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(ushort2, +) -DECLOP_2VAR_2IN_1OUT(ushort2, -) -DECLOP_2VAR_2IN_1OUT(ushort2, *) -DECLOP_2VAR_2IN_1OUT(ushort2, /) -DECLOP_2VAR_2IN_1OUT(ushort2, %) -DECLOP_2VAR_2IN_1OUT(ushort2, &) -DECLOP_2VAR_2IN_1OUT(ushort2, |) -DECLOP_2VAR_2IN_1OUT(ushort2, ^) -DECLOP_2VAR_2IN_1OUT(ushort2, <<) -DECLOP_2VAR_2IN_1OUT(ushort2, >>) - -DECLOP_2VAR_ASSIGN(ushort2, +=) -DECLOP_2VAR_ASSIGN(ushort2, -=) -DECLOP_2VAR_ASSIGN(ushort2, *=) -DECLOP_2VAR_ASSIGN(ushort2, /=) -DECLOP_2VAR_ASSIGN(ushort2, %=) -DECLOP_2VAR_ASSIGN(ushort2, &=) -DECLOP_2VAR_ASSIGN(ushort2, |=) -DECLOP_2VAR_ASSIGN(ushort2, ^=) -DECLOP_2VAR_ASSIGN(ushort2, <<=) -DECLOP_2VAR_ASSIGN(ushort2, >>=) - -DECLOP_2VAR_PREOP(ushort2, ++) -DECLOP_2VAR_PREOP(ushort2, --) - -DECLOP_2VAR_POSTOP(ushort2, ++) -DECLOP_2VAR_POSTOP(ushort2, --) - -DECLOP_2VAR_COMP(ushort2, ==) -DECLOP_2VAR_COMP(ushort2, !=) -DECLOP_2VAR_COMP(ushort2, <) -DECLOP_2VAR_COMP(ushort2, >) -DECLOP_2VAR_COMP(ushort2, <=) -DECLOP_2VAR_COMP(ushort2, >=) - -DECLOP_2VAR_COMP(ushort2, &&) -DECLOP_2VAR_COMP(ushort2, ||) - -DECLOP_2VAR_1IN_1OUT(ushort2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) - -// UNSIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(ushort3, +) -DECLOP_3VAR_2IN_1OUT(ushort3, -) -DECLOP_3VAR_2IN_1OUT(ushort3, *) -DECLOP_3VAR_2IN_1OUT(ushort3, /) -DECLOP_3VAR_2IN_1OUT(ushort3, %) -DECLOP_3VAR_2IN_1OUT(ushort3, &) -DECLOP_3VAR_2IN_1OUT(ushort3, |) -DECLOP_3VAR_2IN_1OUT(ushort3, ^) -DECLOP_3VAR_2IN_1OUT(ushort3, <<) -DECLOP_3VAR_2IN_1OUT(ushort3, >>) - -DECLOP_3VAR_ASSIGN(ushort3, +=) -DECLOP_3VAR_ASSIGN(ushort3, -=) -DECLOP_3VAR_ASSIGN(ushort3, *=) -DECLOP_3VAR_ASSIGN(ushort3, /=) -DECLOP_3VAR_ASSIGN(ushort3, %=) -DECLOP_3VAR_ASSIGN(ushort3, &=) -DECLOP_3VAR_ASSIGN(ushort3, |=) -DECLOP_3VAR_ASSIGN(ushort3, ^=) -DECLOP_3VAR_ASSIGN(ushort3, <<=) -DECLOP_3VAR_ASSIGN(ushort3, >>=) - -DECLOP_3VAR_PREOP(ushort3, ++) -DECLOP_3VAR_PREOP(ushort3, --) - -DECLOP_3VAR_POSTOP(ushort3, ++) -DECLOP_3VAR_POSTOP(ushort3, --) - -DECLOP_3VAR_COMP(ushort3, ==) -DECLOP_3VAR_COMP(ushort3, !=) -DECLOP_3VAR_COMP(ushort3, <) -DECLOP_3VAR_COMP(ushort3, >) -DECLOP_3VAR_COMP(ushort3, <=) -DECLOP_3VAR_COMP(ushort3, >=) - -DECLOP_3VAR_COMP(ushort3, &&) -DECLOP_3VAR_COMP(ushort3, ||) - -DECLOP_3VAR_1IN_1OUT(ushort3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) - -// UNSIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(ushort4, +) -DECLOP_4VAR_2IN_1OUT(ushort4, -) -DECLOP_4VAR_2IN_1OUT(ushort4, *) -DECLOP_4VAR_2IN_1OUT(ushort4, /) -DECLOP_4VAR_2IN_1OUT(ushort4, %) -DECLOP_4VAR_2IN_1OUT(ushort4, &) -DECLOP_4VAR_2IN_1OUT(ushort4, |) -DECLOP_4VAR_2IN_1OUT(ushort4, ^) -DECLOP_4VAR_2IN_1OUT(ushort4, <<) -DECLOP_4VAR_2IN_1OUT(ushort4, >>) - -DECLOP_4VAR_ASSIGN(ushort4, +=) -DECLOP_4VAR_ASSIGN(ushort4, -=) -DECLOP_4VAR_ASSIGN(ushort4, *=) -DECLOP_4VAR_ASSIGN(ushort4, /=) -DECLOP_4VAR_ASSIGN(ushort4, %=) -DECLOP_4VAR_ASSIGN(ushort4, &=) -DECLOP_4VAR_ASSIGN(ushort4, |=) -DECLOP_4VAR_ASSIGN(ushort4, ^=) -DECLOP_4VAR_ASSIGN(ushort4, <<=) -DECLOP_4VAR_ASSIGN(ushort4, >>=) - -DECLOP_4VAR_PREOP(ushort4, ++) -DECLOP_4VAR_PREOP(ushort4, --) - -DECLOP_4VAR_POSTOP(ushort4, ++) -DECLOP_4VAR_POSTOP(ushort4, --) - -DECLOP_4VAR_COMP(ushort4, ==) -DECLOP_4VAR_COMP(ushort4, !=) -DECLOP_4VAR_COMP(ushort4, <) -DECLOP_4VAR_COMP(ushort4, >) -DECLOP_4VAR_COMP(ushort4, <=) -DECLOP_4VAR_COMP(ushort4, >=) - -DECLOP_4VAR_COMP(ushort4, &&) -DECLOP_4VAR_COMP(ushort4, ||) - -DECLOP_4VAR_1IN_1OUT(ushort4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) - -// SIGNED SHORT1 - -DECLOP_1VAR_2IN_1OUT(short1, +) -DECLOP_1VAR_2IN_1OUT(short1, -) -DECLOP_1VAR_2IN_1OUT(short1, *) -DECLOP_1VAR_2IN_1OUT(short1, /) -DECLOP_1VAR_2IN_1OUT(short1, %) -DECLOP_1VAR_2IN_1OUT(short1, &) -DECLOP_1VAR_2IN_1OUT(short1, |) -DECLOP_1VAR_2IN_1OUT(short1, ^) -DECLOP_1VAR_2IN_1OUT(short1, <<) -DECLOP_1VAR_2IN_1OUT(short1, >>) - - -DECLOP_1VAR_ASSIGN(short1, +=) -DECLOP_1VAR_ASSIGN(short1, -=) -DECLOP_1VAR_ASSIGN(short1, *=) -DECLOP_1VAR_ASSIGN(short1, /=) -DECLOP_1VAR_ASSIGN(short1, %=) -DECLOP_1VAR_ASSIGN(short1, &=) -DECLOP_1VAR_ASSIGN(short1, |=) -DECLOP_1VAR_ASSIGN(short1, ^=) -DECLOP_1VAR_ASSIGN(short1, <<=) -DECLOP_1VAR_ASSIGN(short1, >>=) - -DECLOP_1VAR_PREOP(short1, ++) -DECLOP_1VAR_PREOP(short1, --) - -DECLOP_1VAR_POSTOP(short1, ++) -DECLOP_1VAR_POSTOP(short1, --) - -DECLOP_1VAR_COMP(short1, ==) -DECLOP_1VAR_COMP(short1, !=) -DECLOP_1VAR_COMP(short1, <) -DECLOP_1VAR_COMP(short1, >) -DECLOP_1VAR_COMP(short1, <=) -DECLOP_1VAR_COMP(short1, >=) - -DECLOP_1VAR_COMP(short1, &&) -DECLOP_1VAR_COMP(short1, ||) - -DECLOP_1VAR_1IN_1OUT(short1, ~) -DECLOP_1VAR_1IN_BOOLOUT(short1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(short1, float) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(short1, double) -DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) - -// SIGNED SHORT2 - -DECLOP_2VAR_2IN_1OUT(short2, +) -DECLOP_2VAR_2IN_1OUT(short2, -) -DECLOP_2VAR_2IN_1OUT(short2, *) -DECLOP_2VAR_2IN_1OUT(short2, /) -DECLOP_2VAR_2IN_1OUT(short2, %) -DECLOP_2VAR_2IN_1OUT(short2, &) -DECLOP_2VAR_2IN_1OUT(short2, |) -DECLOP_2VAR_2IN_1OUT(short2, ^) -DECLOP_2VAR_2IN_1OUT(short2, <<) -DECLOP_2VAR_2IN_1OUT(short2, >>) - -DECLOP_2VAR_ASSIGN(short2, +=) -DECLOP_2VAR_ASSIGN(short2, -=) -DECLOP_2VAR_ASSIGN(short2, *=) -DECLOP_2VAR_ASSIGN(short2, /=) -DECLOP_2VAR_ASSIGN(short2, %=) -DECLOP_2VAR_ASSIGN(short2, &=) -DECLOP_2VAR_ASSIGN(short2, |=) -DECLOP_2VAR_ASSIGN(short2, ^=) -DECLOP_2VAR_ASSIGN(short2, <<=) -DECLOP_2VAR_ASSIGN(short2, >>=) - -DECLOP_2VAR_PREOP(short2, ++) -DECLOP_2VAR_PREOP(short2, --) - -DECLOP_2VAR_POSTOP(short2, ++) -DECLOP_2VAR_POSTOP(short2, --) - -DECLOP_2VAR_COMP(short2, ==) -DECLOP_2VAR_COMP(short2, !=) -DECLOP_2VAR_COMP(short2, <) -DECLOP_2VAR_COMP(short2, >) -DECLOP_2VAR_COMP(short2, <=) -DECLOP_2VAR_COMP(short2, >=) - -DECLOP_2VAR_COMP(short2, &&) -DECLOP_2VAR_COMP(short2, ||) - -DECLOP_2VAR_1IN_1OUT(short2, ~) -DECLOP_2VAR_1IN_BOOLOUT(short2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(short2, float) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(short2, double) -DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) - -// SIGNED SHORT3 - -DECLOP_3VAR_2IN_1OUT(short3, +) -DECLOP_3VAR_2IN_1OUT(short3, -) -DECLOP_3VAR_2IN_1OUT(short3, *) -DECLOP_3VAR_2IN_1OUT(short3, /) -DECLOP_3VAR_2IN_1OUT(short3, %) -DECLOP_3VAR_2IN_1OUT(short3, &) -DECLOP_3VAR_2IN_1OUT(short3, |) -DECLOP_3VAR_2IN_1OUT(short3, ^) -DECLOP_3VAR_2IN_1OUT(short3, <<) -DECLOP_3VAR_2IN_1OUT(short3, >>) - -DECLOP_3VAR_ASSIGN(short3, +=) -DECLOP_3VAR_ASSIGN(short3, -=) -DECLOP_3VAR_ASSIGN(short3, *=) -DECLOP_3VAR_ASSIGN(short3, /=) -DECLOP_3VAR_ASSIGN(short3, %=) -DECLOP_3VAR_ASSIGN(short3, &=) -DECLOP_3VAR_ASSIGN(short3, |=) -DECLOP_3VAR_ASSIGN(short3, ^=) -DECLOP_3VAR_ASSIGN(short3, <<=) -DECLOP_3VAR_ASSIGN(short3, >>=) - -DECLOP_3VAR_PREOP(short3, ++) -DECLOP_3VAR_PREOP(short3, --) - -DECLOP_3VAR_POSTOP(short3, ++) -DECLOP_3VAR_POSTOP(short3, --) - -DECLOP_3VAR_COMP(short3, ==) -DECLOP_3VAR_COMP(short3, !=) -DECLOP_3VAR_COMP(short3, <) -DECLOP_3VAR_COMP(short3, >) -DECLOP_3VAR_COMP(short3, <=) -DECLOP_3VAR_COMP(short3, >=) - -DECLOP_3VAR_COMP(short3, &&) -DECLOP_3VAR_COMP(short3, ||) - -DECLOP_3VAR_1IN_1OUT(short3, ~) -DECLOP_3VAR_1IN_BOOLOUT(short3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(short3, float) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(short3, double) -DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) - -// SIGNED SHORT4 - -DECLOP_4VAR_2IN_1OUT(short4, +) -DECLOP_4VAR_2IN_1OUT(short4, -) -DECLOP_4VAR_2IN_1OUT(short4, *) -DECLOP_4VAR_2IN_1OUT(short4, /) -DECLOP_4VAR_2IN_1OUT(short4, %) -DECLOP_4VAR_2IN_1OUT(short4, &) -DECLOP_4VAR_2IN_1OUT(short4, |) -DECLOP_4VAR_2IN_1OUT(short4, ^) -DECLOP_4VAR_2IN_1OUT(short4, <<) -DECLOP_4VAR_2IN_1OUT(short4, >>) - -DECLOP_4VAR_ASSIGN(short4, +=) -DECLOP_4VAR_ASSIGN(short4, -=) -DECLOP_4VAR_ASSIGN(short4, *=) -DECLOP_4VAR_ASSIGN(short4, /=) -DECLOP_4VAR_ASSIGN(short4, %=) -DECLOP_4VAR_ASSIGN(short4, &=) -DECLOP_4VAR_ASSIGN(short4, |=) -DECLOP_4VAR_ASSIGN(short4, ^=) -DECLOP_4VAR_ASSIGN(short4, <<=) -DECLOP_4VAR_ASSIGN(short4, >>=) - -DECLOP_4VAR_PREOP(short4, ++) -DECLOP_4VAR_PREOP(short4, --) - -DECLOP_4VAR_POSTOP(short4, ++) -DECLOP_4VAR_POSTOP(short4, --) - -DECLOP_4VAR_COMP(short4, ==) -DECLOP_4VAR_COMP(short4, !=) -DECLOP_4VAR_COMP(short4, <) -DECLOP_4VAR_COMP(short4, >) -DECLOP_4VAR_COMP(short4, <=) -DECLOP_4VAR_COMP(short4, >=) - -DECLOP_4VAR_COMP(short4, &&) -DECLOP_4VAR_COMP(short4, ||) - -DECLOP_4VAR_1IN_1OUT(short4, ~) -DECLOP_4VAR_1IN_BOOLOUT(short4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(short4, float) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(short4, double) -DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) - -// UNSIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(uint1, +) -DECLOP_1VAR_2IN_1OUT(uint1, -) -DECLOP_1VAR_2IN_1OUT(uint1, *) -DECLOP_1VAR_2IN_1OUT(uint1, /) -DECLOP_1VAR_2IN_1OUT(uint1, %) -DECLOP_1VAR_2IN_1OUT(uint1, &) -DECLOP_1VAR_2IN_1OUT(uint1, |) -DECLOP_1VAR_2IN_1OUT(uint1, ^) -DECLOP_1VAR_2IN_1OUT(uint1, <<) -DECLOP_1VAR_2IN_1OUT(uint1, >>) - - -DECLOP_1VAR_ASSIGN(uint1, +=) -DECLOP_1VAR_ASSIGN(uint1, -=) -DECLOP_1VAR_ASSIGN(uint1, *=) -DECLOP_1VAR_ASSIGN(uint1, /=) -DECLOP_1VAR_ASSIGN(uint1, %=) -DECLOP_1VAR_ASSIGN(uint1, &=) -DECLOP_1VAR_ASSIGN(uint1, |=) -DECLOP_1VAR_ASSIGN(uint1, ^=) -DECLOP_1VAR_ASSIGN(uint1, <<=) -DECLOP_1VAR_ASSIGN(uint1, >>=) - -DECLOP_1VAR_PREOP(uint1, ++) -DECLOP_1VAR_PREOP(uint1, --) - -DECLOP_1VAR_POSTOP(uint1, ++) -DECLOP_1VAR_POSTOP(uint1, --) - -DECLOP_1VAR_COMP(uint1, ==) -DECLOP_1VAR_COMP(uint1, !=) -DECLOP_1VAR_COMP(uint1, <) -DECLOP_1VAR_COMP(uint1, >) -DECLOP_1VAR_COMP(uint1, <=) -DECLOP_1VAR_COMP(uint1, >=) - -DECLOP_1VAR_COMP(uint1, &&) -DECLOP_1VAR_COMP(uint1, ||) - -DECLOP_1VAR_1IN_1OUT(uint1, ~) -DECLOP_1VAR_1IN_BOOLOUT(uint1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(uint1, float) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, double) -DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) - -// UNSIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(uint2, +) -DECLOP_2VAR_2IN_1OUT(uint2, -) -DECLOP_2VAR_2IN_1OUT(uint2, *) -DECLOP_2VAR_2IN_1OUT(uint2, /) -DECLOP_2VAR_2IN_1OUT(uint2, %) -DECLOP_2VAR_2IN_1OUT(uint2, &) -DECLOP_2VAR_2IN_1OUT(uint2, |) -DECLOP_2VAR_2IN_1OUT(uint2, ^) -DECLOP_2VAR_2IN_1OUT(uint2, <<) -DECLOP_2VAR_2IN_1OUT(uint2, >>) - -DECLOP_2VAR_ASSIGN(uint2, +=) -DECLOP_2VAR_ASSIGN(uint2, -=) -DECLOP_2VAR_ASSIGN(uint2, *=) -DECLOP_2VAR_ASSIGN(uint2, /=) -DECLOP_2VAR_ASSIGN(uint2, %=) -DECLOP_2VAR_ASSIGN(uint2, &=) -DECLOP_2VAR_ASSIGN(uint2, |=) -DECLOP_2VAR_ASSIGN(uint2, ^=) -DECLOP_2VAR_ASSIGN(uint2, <<=) -DECLOP_2VAR_ASSIGN(uint2, >>=) - -DECLOP_2VAR_PREOP(uint2, ++) -DECLOP_2VAR_PREOP(uint2, --) - -DECLOP_2VAR_POSTOP(uint2, ++) -DECLOP_2VAR_POSTOP(uint2, --) - -DECLOP_2VAR_COMP(uint2, ==) -DECLOP_2VAR_COMP(uint2, !=) -DECLOP_2VAR_COMP(uint2, <) -DECLOP_2VAR_COMP(uint2, >) -DECLOP_2VAR_COMP(uint2, <=) -DECLOP_2VAR_COMP(uint2, >=) - -DECLOP_2VAR_COMP(uint2, &&) -DECLOP_2VAR_COMP(uint2, ||) - -DECLOP_2VAR_1IN_1OUT(uint2, ~) -DECLOP_2VAR_1IN_BOOLOUT(uint2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(uint2, float) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, double) -DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) - -// UNSIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(uint3, +) -DECLOP_3VAR_2IN_1OUT(uint3, -) -DECLOP_3VAR_2IN_1OUT(uint3, *) -DECLOP_3VAR_2IN_1OUT(uint3, /) -DECLOP_3VAR_2IN_1OUT(uint3, %) -DECLOP_3VAR_2IN_1OUT(uint3, &) -DECLOP_3VAR_2IN_1OUT(uint3, |) -DECLOP_3VAR_2IN_1OUT(uint3, ^) -DECLOP_3VAR_2IN_1OUT(uint3, <<) -DECLOP_3VAR_2IN_1OUT(uint3, >>) - -DECLOP_3VAR_ASSIGN(uint3, +=) -DECLOP_3VAR_ASSIGN(uint3, -=) -DECLOP_3VAR_ASSIGN(uint3, *=) -DECLOP_3VAR_ASSIGN(uint3, /=) -DECLOP_3VAR_ASSIGN(uint3, %=) -DECLOP_3VAR_ASSIGN(uint3, &=) -DECLOP_3VAR_ASSIGN(uint3, |=) -DECLOP_3VAR_ASSIGN(uint3, ^=) -DECLOP_3VAR_ASSIGN(uint3, <<=) -DECLOP_3VAR_ASSIGN(uint3, >>=) - -DECLOP_3VAR_PREOP(uint3, ++) -DECLOP_3VAR_PREOP(uint3, --) - -DECLOP_3VAR_POSTOP(uint3, ++) -DECLOP_3VAR_POSTOP(uint3, --) - -DECLOP_3VAR_COMP(uint3, ==) -DECLOP_3VAR_COMP(uint3, !=) -DECLOP_3VAR_COMP(uint3, <) -DECLOP_3VAR_COMP(uint3, >) -DECLOP_3VAR_COMP(uint3, <=) -DECLOP_3VAR_COMP(uint3, >=) - -DECLOP_3VAR_COMP(uint3, &&) -DECLOP_3VAR_COMP(uint3, ||) - -DECLOP_3VAR_1IN_1OUT(uint3, ~) -DECLOP_3VAR_1IN_BOOLOUT(uint3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(uint3, float) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, double) -DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) - -// UNSIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(uint4, +) -DECLOP_4VAR_2IN_1OUT(uint4, -) -DECLOP_4VAR_2IN_1OUT(uint4, *) -DECLOP_4VAR_2IN_1OUT(uint4, /) -DECLOP_4VAR_2IN_1OUT(uint4, %) -DECLOP_4VAR_2IN_1OUT(uint4, &) -DECLOP_4VAR_2IN_1OUT(uint4, |) -DECLOP_4VAR_2IN_1OUT(uint4, ^) -DECLOP_4VAR_2IN_1OUT(uint4, <<) -DECLOP_4VAR_2IN_1OUT(uint4, >>) - -DECLOP_4VAR_ASSIGN(uint4, +=) -DECLOP_4VAR_ASSIGN(uint4, -=) -DECLOP_4VAR_ASSIGN(uint4, *=) -DECLOP_4VAR_ASSIGN(uint4, /=) -DECLOP_4VAR_ASSIGN(uint4, %=) -DECLOP_4VAR_ASSIGN(uint4, &=) -DECLOP_4VAR_ASSIGN(uint4, |=) -DECLOP_4VAR_ASSIGN(uint4, ^=) -DECLOP_4VAR_ASSIGN(uint4, <<=) -DECLOP_4VAR_ASSIGN(uint4, >>=) - -DECLOP_4VAR_PREOP(uint4, ++) -DECLOP_4VAR_PREOP(uint4, --) - -DECLOP_4VAR_POSTOP(uint4, ++) -DECLOP_4VAR_POSTOP(uint4, --) - -DECLOP_4VAR_COMP(uint4, ==) -DECLOP_4VAR_COMP(uint4, !=) -DECLOP_4VAR_COMP(uint4, <) -DECLOP_4VAR_COMP(uint4, >) -DECLOP_4VAR_COMP(uint4, <=) -DECLOP_4VAR_COMP(uint4, >=) - -DECLOP_4VAR_COMP(uint4, &&) -DECLOP_4VAR_COMP(uint4, ||) - -DECLOP_4VAR_1IN_1OUT(uint4, ~) -DECLOP_4VAR_1IN_BOOLOUT(uint4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(uint4, float) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, double) -DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) - -// SIGNED INT1 - -DECLOP_1VAR_2IN_1OUT(int1, +) -DECLOP_1VAR_2IN_1OUT(int1, -) -DECLOP_1VAR_2IN_1OUT(int1, *) -DECLOP_1VAR_2IN_1OUT(int1, /) -DECLOP_1VAR_2IN_1OUT(int1, %) -DECLOP_1VAR_2IN_1OUT(int1, &) -DECLOP_1VAR_2IN_1OUT(int1, |) -DECLOP_1VAR_2IN_1OUT(int1, ^) -DECLOP_1VAR_2IN_1OUT(int1, <<) -DECLOP_1VAR_2IN_1OUT(int1, >>) - - -DECLOP_1VAR_ASSIGN(int1, +=) -DECLOP_1VAR_ASSIGN(int1, -=) -DECLOP_1VAR_ASSIGN(int1, *=) -DECLOP_1VAR_ASSIGN(int1, /=) -DECLOP_1VAR_ASSIGN(int1, %=) -DECLOP_1VAR_ASSIGN(int1, &=) -DECLOP_1VAR_ASSIGN(int1, |=) -DECLOP_1VAR_ASSIGN(int1, ^=) -DECLOP_1VAR_ASSIGN(int1, <<=) -DECLOP_1VAR_ASSIGN(int1, >>=) - -DECLOP_1VAR_PREOP(int1, ++) -DECLOP_1VAR_PREOP(int1, --) - -DECLOP_1VAR_POSTOP(int1, ++) -DECLOP_1VAR_POSTOP(int1, --) - -DECLOP_1VAR_COMP(int1, ==) -DECLOP_1VAR_COMP(int1, !=) -DECLOP_1VAR_COMP(int1, <) -DECLOP_1VAR_COMP(int1, >) -DECLOP_1VAR_COMP(int1, <=) -DECLOP_1VAR_COMP(int1, >=) - -DECLOP_1VAR_COMP(int1, &&) -DECLOP_1VAR_COMP(int1, ||) - -DECLOP_1VAR_1IN_1OUT(int1, ~) -DECLOP_1VAR_1IN_BOOLOUT(int1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(int1, float) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(int1, double) -DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) - -// SIGNED INT2 - -DECLOP_2VAR_2IN_1OUT(int2, +) -DECLOP_2VAR_2IN_1OUT(int2, -) -DECLOP_2VAR_2IN_1OUT(int2, *) -DECLOP_2VAR_2IN_1OUT(int2, /) -DECLOP_2VAR_2IN_1OUT(int2, %) -DECLOP_2VAR_2IN_1OUT(int2, &) -DECLOP_2VAR_2IN_1OUT(int2, |) -DECLOP_2VAR_2IN_1OUT(int2, ^) -DECLOP_2VAR_2IN_1OUT(int2, <<) -DECLOP_2VAR_2IN_1OUT(int2, >>) - -DECLOP_2VAR_ASSIGN(int2, +=) -DECLOP_2VAR_ASSIGN(int2, -=) -DECLOP_2VAR_ASSIGN(int2, *=) -DECLOP_2VAR_ASSIGN(int2, /=) -DECLOP_2VAR_ASSIGN(int2, %=) -DECLOP_2VAR_ASSIGN(int2, &=) -DECLOP_2VAR_ASSIGN(int2, |=) -DECLOP_2VAR_ASSIGN(int2, ^=) -DECLOP_2VAR_ASSIGN(int2, <<=) -DECLOP_2VAR_ASSIGN(int2, >>=) - -DECLOP_2VAR_PREOP(int2, ++) -DECLOP_2VAR_PREOP(int2, --) - -DECLOP_2VAR_POSTOP(int2, ++) -DECLOP_2VAR_POSTOP(int2, --) - -DECLOP_2VAR_COMP(int2, ==) -DECLOP_2VAR_COMP(int2, !=) -DECLOP_2VAR_COMP(int2, <) -DECLOP_2VAR_COMP(int2, >) -DECLOP_2VAR_COMP(int2, <=) -DECLOP_2VAR_COMP(int2, >=) - -DECLOP_2VAR_COMP(int2, &&) -DECLOP_2VAR_COMP(int2, ||) - -DECLOP_2VAR_1IN_1OUT(int2, ~) -DECLOP_2VAR_1IN_BOOLOUT(int2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(int2, float) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(int2, double) -DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) - -// SIGNED INT3 - -DECLOP_3VAR_2IN_1OUT(int3, +) -DECLOP_3VAR_2IN_1OUT(int3, -) -DECLOP_3VAR_2IN_1OUT(int3, *) -DECLOP_3VAR_2IN_1OUT(int3, /) -DECLOP_3VAR_2IN_1OUT(int3, %) -DECLOP_3VAR_2IN_1OUT(int3, &) -DECLOP_3VAR_2IN_1OUT(int3, |) -DECLOP_3VAR_2IN_1OUT(int3, ^) -DECLOP_3VAR_2IN_1OUT(int3, <<) -DECLOP_3VAR_2IN_1OUT(int3, >>) - -DECLOP_3VAR_ASSIGN(int3, +=) -DECLOP_3VAR_ASSIGN(int3, -=) -DECLOP_3VAR_ASSIGN(int3, *=) -DECLOP_3VAR_ASSIGN(int3, /=) -DECLOP_3VAR_ASSIGN(int3, %=) -DECLOP_3VAR_ASSIGN(int3, &=) -DECLOP_3VAR_ASSIGN(int3, |=) -DECLOP_3VAR_ASSIGN(int3, ^=) -DECLOP_3VAR_ASSIGN(int3, <<=) -DECLOP_3VAR_ASSIGN(int3, >>=) - -DECLOP_3VAR_PREOP(int3, ++) -DECLOP_3VAR_PREOP(int3, --) - -DECLOP_3VAR_POSTOP(int3, ++) -DECLOP_3VAR_POSTOP(int3, --) - -DECLOP_3VAR_COMP(int3, ==) -DECLOP_3VAR_COMP(int3, !=) -DECLOP_3VAR_COMP(int3, <) -DECLOP_3VAR_COMP(int3, >) -DECLOP_3VAR_COMP(int3, <=) -DECLOP_3VAR_COMP(int3, >=) - -DECLOP_3VAR_COMP(int3, &&) -DECLOP_3VAR_COMP(int3, ||) - -DECLOP_3VAR_1IN_1OUT(int3, ~) -DECLOP_3VAR_1IN_BOOLOUT(int3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(int3, float) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(int3, double) -DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) - -// SIGNED INT4 - -DECLOP_4VAR_2IN_1OUT(int4, +) -DECLOP_4VAR_2IN_1OUT(int4, -) -DECLOP_4VAR_2IN_1OUT(int4, *) -DECLOP_4VAR_2IN_1OUT(int4, /) -DECLOP_4VAR_2IN_1OUT(int4, %) -DECLOP_4VAR_2IN_1OUT(int4, &) -DECLOP_4VAR_2IN_1OUT(int4, |) -DECLOP_4VAR_2IN_1OUT(int4, ^) -DECLOP_4VAR_2IN_1OUT(int4, <<) -DECLOP_4VAR_2IN_1OUT(int4, >>) - -DECLOP_4VAR_ASSIGN(int4, +=) -DECLOP_4VAR_ASSIGN(int4, -=) -DECLOP_4VAR_ASSIGN(int4, *=) -DECLOP_4VAR_ASSIGN(int4, /=) -DECLOP_4VAR_ASSIGN(int4, %=) -DECLOP_4VAR_ASSIGN(int4, &=) -DECLOP_4VAR_ASSIGN(int4, |=) -DECLOP_4VAR_ASSIGN(int4, ^=) -DECLOP_4VAR_ASSIGN(int4, <<=) -DECLOP_4VAR_ASSIGN(int4, >>=) - -DECLOP_4VAR_PREOP(int4, ++) -DECLOP_4VAR_PREOP(int4, --) - -DECLOP_4VAR_POSTOP(int4, ++) -DECLOP_4VAR_POSTOP(int4, --) - -DECLOP_4VAR_COMP(int4, ==) -DECLOP_4VAR_COMP(int4, !=) -DECLOP_4VAR_COMP(int4, <) -DECLOP_4VAR_COMP(int4, >) -DECLOP_4VAR_COMP(int4, <=) -DECLOP_4VAR_COMP(int4, >=) - -DECLOP_4VAR_COMP(int4, &&) -DECLOP_4VAR_COMP(int4, ||) - -DECLOP_4VAR_1IN_1OUT(int4, ~) -DECLOP_4VAR_1IN_BOOLOUT(int4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(int4, float) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(int4, double) -DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) - -// FLOAT1 - -DECLOP_1VAR_2IN_1OUT(float1, +) -DECLOP_1VAR_2IN_1OUT(float1, -) -DECLOP_1VAR_2IN_1OUT(float1, *) -DECLOP_1VAR_2IN_1OUT(float1, /) - -DECLOP_1VAR_ASSIGN(float1, +=) -DECLOP_1VAR_ASSIGN(float1, -=) -DECLOP_1VAR_ASSIGN(float1, *=) -DECLOP_1VAR_ASSIGN(float1, /=) - -DECLOP_1VAR_PREOP(float1, ++) -DECLOP_1VAR_PREOP(float1, --) - -DECLOP_1VAR_POSTOP(float1, ++) -DECLOP_1VAR_POSTOP(float1, --) - -DECLOP_1VAR_COMP(float1, ==) -DECLOP_1VAR_COMP(float1, !=) -DECLOP_1VAR_COMP(float1, <) -DECLOP_1VAR_COMP(float1, >) -DECLOP_1VAR_COMP(float1, <=) -DECLOP_1VAR_COMP(float1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(float1, float) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(float1, double) -DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) - -// FLOAT2 - -DECLOP_2VAR_2IN_1OUT(float2, +) -DECLOP_2VAR_2IN_1OUT(float2, -) -DECLOP_2VAR_2IN_1OUT(float2, *) -DECLOP_2VAR_2IN_1OUT(float2, /) - -DECLOP_2VAR_ASSIGN(float2, +=) -DECLOP_2VAR_ASSIGN(float2, -=) -DECLOP_2VAR_ASSIGN(float2, *=) 
-DECLOP_2VAR_ASSIGN(float2, /=) - -DECLOP_2VAR_PREOP(float2, ++) -DECLOP_2VAR_PREOP(float2, --) - -DECLOP_2VAR_POSTOP(float2, ++) -DECLOP_2VAR_POSTOP(float2, --) - -DECLOP_2VAR_COMP(float2, ==) -DECLOP_2VAR_COMP(float2, !=) -DECLOP_2VAR_COMP(float2, <) -DECLOP_2VAR_COMP(float2, >) -DECLOP_2VAR_COMP(float2, <=) -DECLOP_2VAR_COMP(float2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(float2, float) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(float2, double) -DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) - -// FLOAT3 - -DECLOP_3VAR_2IN_1OUT(float3, +) -DECLOP_3VAR_2IN_1OUT(float3, -) -DECLOP_3VAR_2IN_1OUT(float3, *) -DECLOP_3VAR_2IN_1OUT(float3, /) - -DECLOP_3VAR_ASSIGN(float3, +=) -DECLOP_3VAR_ASSIGN(float3, -=) -DECLOP_3VAR_ASSIGN(float3, *=) -DECLOP_3VAR_ASSIGN(float3, /=) - -DECLOP_3VAR_PREOP(float3, ++) -DECLOP_3VAR_PREOP(float3, --) - -DECLOP_3VAR_POSTOP(float3, ++) -DECLOP_3VAR_POSTOP(float3, --) - -DECLOP_3VAR_COMP(float3, ==) -DECLOP_3VAR_COMP(float3, !=) -DECLOP_3VAR_COMP(float3, <) -DECLOP_3VAR_COMP(float3, >) -DECLOP_3VAR_COMP(float3, <=) -DECLOP_3VAR_COMP(float3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(float3, float) -DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(float3, double) 
-DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) - -// FLOAT4 - -DECLOP_4VAR_2IN_1OUT(float4, +) -DECLOP_4VAR_2IN_1OUT(float4, -) -DECLOP_4VAR_2IN_1OUT(float4, *) -DECLOP_4VAR_2IN_1OUT(float4, /) - -DECLOP_4VAR_ASSIGN(float4, +=) -DECLOP_4VAR_ASSIGN(float4, -=) -DECLOP_4VAR_ASSIGN(float4, *=) -DECLOP_4VAR_ASSIGN(float4, /=) - -DECLOP_4VAR_PREOP(float4, ++) -DECLOP_4VAR_PREOP(float4, --) - -DECLOP_4VAR_POSTOP(float4, ++) -DECLOP_4VAR_POSTOP(float4, --) - -DECLOP_4VAR_COMP(float4, ==) -DECLOP_4VAR_COMP(float4, !=) -DECLOP_4VAR_COMP(float4, <) -DECLOP_4VAR_COMP(float4, >) -DECLOP_4VAR_COMP(float4, <=) -DECLOP_4VAR_COMP(float4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(float4, float) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(float4, double) -DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) - -// DOUBLE1 - -DECLOP_1VAR_2IN_1OUT(double1, +) -DECLOP_1VAR_2IN_1OUT(double1, -) -DECLOP_1VAR_2IN_1OUT(double1, *) -DECLOP_1VAR_2IN_1OUT(double1, /) - -DECLOP_1VAR_ASSIGN(double1, +=) -DECLOP_1VAR_ASSIGN(double1, -=) -DECLOP_1VAR_ASSIGN(double1, *=) -DECLOP_1VAR_ASSIGN(double1, /=) - -DECLOP_1VAR_PREOP(double1, ++) -DECLOP_1VAR_PREOP(double1, --) - -DECLOP_1VAR_POSTOP(double1, ++) -DECLOP_1VAR_POSTOP(double1, --) - -DECLOP_1VAR_COMP(double1, ==) -DECLOP_1VAR_COMP(double1, !=) -DECLOP_1VAR_COMP(double1, <) -DECLOP_1VAR_COMP(double1, >) -DECLOP_1VAR_COMP(double1, <=) -DECLOP_1VAR_COMP(double1, >=) - -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) 
-DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(double1, float) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(double1, double) -DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) - -// DOUBLE2 - -DECLOP_2VAR_2IN_1OUT(double2, +) -DECLOP_2VAR_2IN_1OUT(double2, -) -DECLOP_2VAR_2IN_1OUT(double2, *) -DECLOP_2VAR_2IN_1OUT(double2, /) - -DECLOP_2VAR_ASSIGN(double2, +=) -DECLOP_2VAR_ASSIGN(double2, -=) -DECLOP_2VAR_ASSIGN(double2, *=) -DECLOP_2VAR_ASSIGN(double2, /=) - -DECLOP_2VAR_PREOP(double2, ++) -DECLOP_2VAR_PREOP(double2, --) - -DECLOP_2VAR_POSTOP(double2, ++) -DECLOP_2VAR_POSTOP(double2, --) - -DECLOP_2VAR_COMP(double2, ==) -DECLOP_2VAR_COMP(double2, !=) -DECLOP_2VAR_COMP(double2, <) -DECLOP_2VAR_COMP(double2, >) -DECLOP_2VAR_COMP(double2, <=) -DECLOP_2VAR_COMP(double2, >=) - -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(double2, float) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(double2, double) -DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) - -// DOUBLE3 - -DECLOP_3VAR_2IN_1OUT(double3, +) -DECLOP_3VAR_2IN_1OUT(double3, -) -DECLOP_3VAR_2IN_1OUT(double3, *) -DECLOP_3VAR_2IN_1OUT(double3, /) - -DECLOP_3VAR_ASSIGN(double3, +=) -DECLOP_3VAR_ASSIGN(double3, -=) -DECLOP_3VAR_ASSIGN(double3, *=) -DECLOP_3VAR_ASSIGN(double3, /=) - 
-DECLOP_3VAR_PREOP(double3, ++) -DECLOP_3VAR_PREOP(double3, --) - -DECLOP_3VAR_POSTOP(double3, ++) -DECLOP_3VAR_POSTOP(double3, --) - -DECLOP_3VAR_COMP(double3, ==) -DECLOP_3VAR_COMP(double3, !=) -DECLOP_3VAR_COMP(double3, <) -DECLOP_3VAR_COMP(double3, >) -DECLOP_3VAR_COMP(double3, <=) -DECLOP_3VAR_COMP(double3, >=) - -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(double3, float) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(double3, double) -DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) - -// DOUBLE4 - -DECLOP_4VAR_2IN_1OUT(double4, +) -DECLOP_4VAR_2IN_1OUT(double4, -) -DECLOP_4VAR_2IN_1OUT(double4, *) -DECLOP_4VAR_2IN_1OUT(double4, /) - -DECLOP_4VAR_ASSIGN(double4, +=) -DECLOP_4VAR_ASSIGN(double4, -=) -DECLOP_4VAR_ASSIGN(double4, *=) -DECLOP_4VAR_ASSIGN(double4, /=) - -DECLOP_4VAR_PREOP(double4, ++) -DECLOP_4VAR_PREOP(double4, --) - -DECLOP_4VAR_POSTOP(double4, ++) -DECLOP_4VAR_POSTOP(double4, --) - -DECLOP_4VAR_COMP(double4, ==) -DECLOP_4VAR_COMP(double4, !=) -DECLOP_4VAR_COMP(double4, <) -DECLOP_4VAR_COMP(double4, >) -DECLOP_4VAR_COMP(double4, <=) -DECLOP_4VAR_COMP(double4, >=) - -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(double4, float) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) 
-DECLOP_4VAR_SCALE_PRODUCT(double4, double) -DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) - -// UNSIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(ulong1, +) -DECLOP_1VAR_2IN_1OUT(ulong1, -) -DECLOP_1VAR_2IN_1OUT(ulong1, *) -DECLOP_1VAR_2IN_1OUT(ulong1, /) -DECLOP_1VAR_2IN_1OUT(ulong1, %) -DECLOP_1VAR_2IN_1OUT(ulong1, &) -DECLOP_1VAR_2IN_1OUT(ulong1, |) -DECLOP_1VAR_2IN_1OUT(ulong1, ^) -DECLOP_1VAR_2IN_1OUT(ulong1, <<) -DECLOP_1VAR_2IN_1OUT(ulong1, >>) - - -DECLOP_1VAR_ASSIGN(ulong1, +=) -DECLOP_1VAR_ASSIGN(ulong1, -=) -DECLOP_1VAR_ASSIGN(ulong1, *=) -DECLOP_1VAR_ASSIGN(ulong1, /=) -DECLOP_1VAR_ASSIGN(ulong1, %=) -DECLOP_1VAR_ASSIGN(ulong1, &=) -DECLOP_1VAR_ASSIGN(ulong1, |=) -DECLOP_1VAR_ASSIGN(ulong1, ^=) -DECLOP_1VAR_ASSIGN(ulong1, <<=) -DECLOP_1VAR_ASSIGN(ulong1, >>=) - -DECLOP_1VAR_PREOP(ulong1, ++) -DECLOP_1VAR_PREOP(ulong1, --) - -DECLOP_1VAR_POSTOP(ulong1, ++) -DECLOP_1VAR_POSTOP(ulong1, --) - -DECLOP_1VAR_COMP(ulong1, ==) -DECLOP_1VAR_COMP(ulong1, !=) -DECLOP_1VAR_COMP(ulong1, <) -DECLOP_1VAR_COMP(ulong1, >) -DECLOP_1VAR_COMP(ulong1, <=) -DECLOP_1VAR_COMP(ulong1, >=) - -DECLOP_1VAR_COMP(ulong1, &&) -DECLOP_1VAR_COMP(ulong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) - -// UNSIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(ulong2, +) -DECLOP_2VAR_2IN_1OUT(ulong2, -) -DECLOP_2VAR_2IN_1OUT(ulong2, *) -DECLOP_2VAR_2IN_1OUT(ulong2, /) -DECLOP_2VAR_2IN_1OUT(ulong2, %) -DECLOP_2VAR_2IN_1OUT(ulong2, &) -DECLOP_2VAR_2IN_1OUT(ulong2, |) -DECLOP_2VAR_2IN_1OUT(ulong2, ^) -DECLOP_2VAR_2IN_1OUT(ulong2, <<) -DECLOP_2VAR_2IN_1OUT(ulong2, >>) - -DECLOP_2VAR_ASSIGN(ulong2, +=) -DECLOP_2VAR_ASSIGN(ulong2, -=) -DECLOP_2VAR_ASSIGN(ulong2, *=) -DECLOP_2VAR_ASSIGN(ulong2, /=) -DECLOP_2VAR_ASSIGN(ulong2, %=) -DECLOP_2VAR_ASSIGN(ulong2, &=) -DECLOP_2VAR_ASSIGN(ulong2, |=) -DECLOP_2VAR_ASSIGN(ulong2, ^=) -DECLOP_2VAR_ASSIGN(ulong2, <<=) -DECLOP_2VAR_ASSIGN(ulong2, >>=) - -DECLOP_2VAR_PREOP(ulong2, ++) -DECLOP_2VAR_PREOP(ulong2, --) - -DECLOP_2VAR_POSTOP(ulong2, ++) -DECLOP_2VAR_POSTOP(ulong2, --) - -DECLOP_2VAR_COMP(ulong2, ==) -DECLOP_2VAR_COMP(ulong2, !=) -DECLOP_2VAR_COMP(ulong2, <) -DECLOP_2VAR_COMP(ulong2, >) -DECLOP_2VAR_COMP(ulong2, <=) -DECLOP_2VAR_COMP(ulong2, >=) - -DECLOP_2VAR_COMP(ulong2, &&) -DECLOP_2VAR_COMP(ulong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) - -// UNSIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(ulong3, +) -DECLOP_3VAR_2IN_1OUT(ulong3, -) -DECLOP_3VAR_2IN_1OUT(ulong3, *) -DECLOP_3VAR_2IN_1OUT(ulong3, /) -DECLOP_3VAR_2IN_1OUT(ulong3, %) -DECLOP_3VAR_2IN_1OUT(ulong3, &) -DECLOP_3VAR_2IN_1OUT(ulong3, |) -DECLOP_3VAR_2IN_1OUT(ulong3, ^) -DECLOP_3VAR_2IN_1OUT(ulong3, <<) -DECLOP_3VAR_2IN_1OUT(ulong3, >>) - -DECLOP_3VAR_ASSIGN(ulong3, +=) -DECLOP_3VAR_ASSIGN(ulong3, -=) -DECLOP_3VAR_ASSIGN(ulong3, *=) -DECLOP_3VAR_ASSIGN(ulong3, /=) -DECLOP_3VAR_ASSIGN(ulong3, %=) -DECLOP_3VAR_ASSIGN(ulong3, &=) -DECLOP_3VAR_ASSIGN(ulong3, |=) -DECLOP_3VAR_ASSIGN(ulong3, ^=) -DECLOP_3VAR_ASSIGN(ulong3, <<=) -DECLOP_3VAR_ASSIGN(ulong3, >>=) - -DECLOP_3VAR_PREOP(ulong3, ++) -DECLOP_3VAR_PREOP(ulong3, --) - -DECLOP_3VAR_POSTOP(ulong3, ++) -DECLOP_3VAR_POSTOP(ulong3, --) - -DECLOP_3VAR_COMP(ulong3, ==) -DECLOP_3VAR_COMP(ulong3, !=) -DECLOP_3VAR_COMP(ulong3, <) -DECLOP_3VAR_COMP(ulong3, >) -DECLOP_3VAR_COMP(ulong3, <=) -DECLOP_3VAR_COMP(ulong3, >=) - -DECLOP_3VAR_COMP(ulong3, &&) -DECLOP_3VAR_COMP(ulong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) - -// UNSIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(ulong4, +) -DECLOP_4VAR_2IN_1OUT(ulong4, -) -DECLOP_4VAR_2IN_1OUT(ulong4, *) -DECLOP_4VAR_2IN_1OUT(ulong4, /) -DECLOP_4VAR_2IN_1OUT(ulong4, %) -DECLOP_4VAR_2IN_1OUT(ulong4, &) -DECLOP_4VAR_2IN_1OUT(ulong4, |) -DECLOP_4VAR_2IN_1OUT(ulong4, ^) -DECLOP_4VAR_2IN_1OUT(ulong4, <<) -DECLOP_4VAR_2IN_1OUT(ulong4, >>) - -DECLOP_4VAR_ASSIGN(ulong4, +=) -DECLOP_4VAR_ASSIGN(ulong4, -=) -DECLOP_4VAR_ASSIGN(ulong4, *=) -DECLOP_4VAR_ASSIGN(ulong4, /=) -DECLOP_4VAR_ASSIGN(ulong4, %=) -DECLOP_4VAR_ASSIGN(ulong4, &=) -DECLOP_4VAR_ASSIGN(ulong4, |=) -DECLOP_4VAR_ASSIGN(ulong4, ^=) -DECLOP_4VAR_ASSIGN(ulong4, <<=) -DECLOP_4VAR_ASSIGN(ulong4, >>=) - -DECLOP_4VAR_PREOP(ulong4, ++) -DECLOP_4VAR_PREOP(ulong4, --) - -DECLOP_4VAR_POSTOP(ulong4, ++) -DECLOP_4VAR_POSTOP(ulong4, --) - -DECLOP_4VAR_COMP(ulong4, ==) -DECLOP_4VAR_COMP(ulong4, !=) -DECLOP_4VAR_COMP(ulong4, <) -DECLOP_4VAR_COMP(ulong4, >) -DECLOP_4VAR_COMP(ulong4, <=) -DECLOP_4VAR_COMP(ulong4, >=) - -DECLOP_4VAR_COMP(ulong4, &&) -DECLOP_4VAR_COMP(ulong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) - -// SIGNED LONG1 - -DECLOP_1VAR_2IN_1OUT(long1, +) -DECLOP_1VAR_2IN_1OUT(long1, -) -DECLOP_1VAR_2IN_1OUT(long1, *) -DECLOP_1VAR_2IN_1OUT(long1, /) -DECLOP_1VAR_2IN_1OUT(long1, %) -DECLOP_1VAR_2IN_1OUT(long1, &) -DECLOP_1VAR_2IN_1OUT(long1, |) -DECLOP_1VAR_2IN_1OUT(long1, ^) -DECLOP_1VAR_2IN_1OUT(long1, <<) -DECLOP_1VAR_2IN_1OUT(long1, >>) - - -DECLOP_1VAR_ASSIGN(long1, +=) -DECLOP_1VAR_ASSIGN(long1, -=) -DECLOP_1VAR_ASSIGN(long1, *=) -DECLOP_1VAR_ASSIGN(long1, /=) -DECLOP_1VAR_ASSIGN(long1, %=) -DECLOP_1VAR_ASSIGN(long1, &=) -DECLOP_1VAR_ASSIGN(long1, |=) -DECLOP_1VAR_ASSIGN(long1, ^=) -DECLOP_1VAR_ASSIGN(long1, <<=) -DECLOP_1VAR_ASSIGN(long1, >>=) - -DECLOP_1VAR_PREOP(long1, ++) -DECLOP_1VAR_PREOP(long1, --) - -DECLOP_1VAR_POSTOP(long1, ++) -DECLOP_1VAR_POSTOP(long1, --) - -DECLOP_1VAR_COMP(long1, ==) -DECLOP_1VAR_COMP(long1, !=) -DECLOP_1VAR_COMP(long1, <) -DECLOP_1VAR_COMP(long1, >) -DECLOP_1VAR_COMP(long1, <=) -DECLOP_1VAR_COMP(long1, >=) - -DECLOP_1VAR_COMP(long1, &&) -DECLOP_1VAR_COMP(long1, ||) - -DECLOP_1VAR_1IN_1OUT(long1, ~) -DECLOP_1VAR_1IN_BOOLOUT(long1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(long1, float) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(long1, double) -DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) - -// SIGNED LONG2 - -DECLOP_2VAR_2IN_1OUT(long2, +) -DECLOP_2VAR_2IN_1OUT(long2, -) -DECLOP_2VAR_2IN_1OUT(long2, *) -DECLOP_2VAR_2IN_1OUT(long2, /) -DECLOP_2VAR_2IN_1OUT(long2, %) -DECLOP_2VAR_2IN_1OUT(long2, &) -DECLOP_2VAR_2IN_1OUT(long2, |) -DECLOP_2VAR_2IN_1OUT(long2, ^) -DECLOP_2VAR_2IN_1OUT(long2, <<) -DECLOP_2VAR_2IN_1OUT(long2, >>) - -DECLOP_2VAR_ASSIGN(long2, +=) -DECLOP_2VAR_ASSIGN(long2, -=) -DECLOP_2VAR_ASSIGN(long2, *=) -DECLOP_2VAR_ASSIGN(long2, /=) -DECLOP_2VAR_ASSIGN(long2, %=) -DECLOP_2VAR_ASSIGN(long2, &=) -DECLOP_2VAR_ASSIGN(long2, |=) -DECLOP_2VAR_ASSIGN(long2, ^=) -DECLOP_2VAR_ASSIGN(long2, <<=) -DECLOP_2VAR_ASSIGN(long2, >>=) - -DECLOP_2VAR_PREOP(long2, ++) -DECLOP_2VAR_PREOP(long2, --) - -DECLOP_2VAR_POSTOP(long2, ++) -DECLOP_2VAR_POSTOP(long2, --) - -DECLOP_2VAR_COMP(long2, ==) -DECLOP_2VAR_COMP(long2, !=) -DECLOP_2VAR_COMP(long2, <) -DECLOP_2VAR_COMP(long2, >) -DECLOP_2VAR_COMP(long2, <=) -DECLOP_2VAR_COMP(long2, >=) - -DECLOP_2VAR_COMP(long2, &&) -DECLOP_2VAR_COMP(long2, ||) - -DECLOP_2VAR_1IN_1OUT(long2, ~) -DECLOP_2VAR_1IN_BOOLOUT(long2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(long2, float) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(long2, double) -DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) - -// SIGNED LONG3 - -DECLOP_3VAR_2IN_1OUT(long3, +) -DECLOP_3VAR_2IN_1OUT(long3, -) -DECLOP_3VAR_2IN_1OUT(long3, *) -DECLOP_3VAR_2IN_1OUT(long3, /) -DECLOP_3VAR_2IN_1OUT(long3, %) -DECLOP_3VAR_2IN_1OUT(long3, &) -DECLOP_3VAR_2IN_1OUT(long3, |) -DECLOP_3VAR_2IN_1OUT(long3, ^) -DECLOP_3VAR_2IN_1OUT(long3, <<) -DECLOP_3VAR_2IN_1OUT(long3, >>) - -DECLOP_3VAR_ASSIGN(long3, +=) -DECLOP_3VAR_ASSIGN(long3, -=) -DECLOP_3VAR_ASSIGN(long3, *=) -DECLOP_3VAR_ASSIGN(long3, /=) -DECLOP_3VAR_ASSIGN(long3, %=) -DECLOP_3VAR_ASSIGN(long3, &=) -DECLOP_3VAR_ASSIGN(long3, |=) -DECLOP_3VAR_ASSIGN(long3, ^=) -DECLOP_3VAR_ASSIGN(long3, <<=) -DECLOP_3VAR_ASSIGN(long3, >>=) - -DECLOP_3VAR_PREOP(long3, ++) -DECLOP_3VAR_PREOP(long3, --) - -DECLOP_3VAR_POSTOP(long3, ++) -DECLOP_3VAR_POSTOP(long3, --) - -DECLOP_3VAR_COMP(long3, ==) -DECLOP_3VAR_COMP(long3, !=) -DECLOP_3VAR_COMP(long3, <) -DECLOP_3VAR_COMP(long3, >) -DECLOP_3VAR_COMP(long3, <=) -DECLOP_3VAR_COMP(long3, >=) - -DECLOP_3VAR_COMP(long3, &&) -DECLOP_3VAR_COMP(long3, ||) - -DECLOP_3VAR_1IN_1OUT(long3, ~) -DECLOP_3VAR_1IN_BOOLOUT(long3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(long3, float) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(long3, double) -DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) - -// SIGNED LONG4 - -DECLOP_4VAR_2IN_1OUT(long4, +) -DECLOP_4VAR_2IN_1OUT(long4, -) -DECLOP_4VAR_2IN_1OUT(long4, *) -DECLOP_4VAR_2IN_1OUT(long4, /) -DECLOP_4VAR_2IN_1OUT(long4, %) -DECLOP_4VAR_2IN_1OUT(long4, &) -DECLOP_4VAR_2IN_1OUT(long4, |) -DECLOP_4VAR_2IN_1OUT(long4, ^) -DECLOP_4VAR_2IN_1OUT(long4, <<) -DECLOP_4VAR_2IN_1OUT(long4, >>) - -DECLOP_4VAR_ASSIGN(long4, +=) -DECLOP_4VAR_ASSIGN(long4, -=) -DECLOP_4VAR_ASSIGN(long4, *=) -DECLOP_4VAR_ASSIGN(long4, /=) -DECLOP_4VAR_ASSIGN(long4, %=) -DECLOP_4VAR_ASSIGN(long4, &=) -DECLOP_4VAR_ASSIGN(long4, |=) -DECLOP_4VAR_ASSIGN(long4, ^=) -DECLOP_4VAR_ASSIGN(long4, <<=) -DECLOP_4VAR_ASSIGN(long4, >>=) - -DECLOP_4VAR_PREOP(long4, ++) -DECLOP_4VAR_PREOP(long4, --) - -DECLOP_4VAR_POSTOP(long4, ++) -DECLOP_4VAR_POSTOP(long4, --) - -DECLOP_4VAR_COMP(long4, ==) -DECLOP_4VAR_COMP(long4, !=) -DECLOP_4VAR_COMP(long4, <) -DECLOP_4VAR_COMP(long4, >) -DECLOP_4VAR_COMP(long4, <=) -DECLOP_4VAR_COMP(long4, >=) - -DECLOP_4VAR_COMP(long4, &&) -DECLOP_4VAR_COMP(long4, ||) - -DECLOP_4VAR_1IN_1OUT(long4, ~) -DECLOP_4VAR_1IN_BOOLOUT(long4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(long4, float) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(long4, double) -DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) - -// UNSIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(ulonglong1, +) -DECLOP_1VAR_2IN_1OUT(ulonglong1, -) -DECLOP_1VAR_2IN_1OUT(ulonglong1, *) -DECLOP_1VAR_2IN_1OUT(ulonglong1, /) -DECLOP_1VAR_2IN_1OUT(ulonglong1, %) -DECLOP_1VAR_2IN_1OUT(ulonglong1, &) -DECLOP_1VAR_2IN_1OUT(ulonglong1, |) -DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) -DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) -DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) - - -DECLOP_1VAR_ASSIGN(ulonglong1, +=) -DECLOP_1VAR_ASSIGN(ulonglong1, -=) -DECLOP_1VAR_ASSIGN(ulonglong1, *=) -DECLOP_1VAR_ASSIGN(ulonglong1, /=) -DECLOP_1VAR_ASSIGN(ulonglong1, %=) -DECLOP_1VAR_ASSIGN(ulonglong1, &=) -DECLOP_1VAR_ASSIGN(ulonglong1, |=) -DECLOP_1VAR_ASSIGN(ulonglong1, ^=) -DECLOP_1VAR_ASSIGN(ulonglong1, <<=) -DECLOP_1VAR_ASSIGN(ulonglong1, >>=) - -DECLOP_1VAR_PREOP(ulonglong1, ++) -DECLOP_1VAR_PREOP(ulonglong1, --) - -DECLOP_1VAR_POSTOP(ulonglong1, ++) -DECLOP_1VAR_POSTOP(ulonglong1, --) - -DECLOP_1VAR_COMP(ulonglong1, ==) -DECLOP_1VAR_COMP(ulonglong1, !=) -DECLOP_1VAR_COMP(ulonglong1, <) -DECLOP_1VAR_COMP(ulonglong1, >) -DECLOP_1VAR_COMP(ulonglong1, <=) -DECLOP_1VAR_COMP(ulonglong1, >=) - -DECLOP_1VAR_COMP(ulonglong1, &&) -DECLOP_1VAR_COMP(ulonglong1, ||) - -DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) - -// UNSIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(ulonglong2, +) -DECLOP_2VAR_2IN_1OUT(ulonglong2, -) -DECLOP_2VAR_2IN_1OUT(ulonglong2, *) -DECLOP_2VAR_2IN_1OUT(ulonglong2, /) -DECLOP_2VAR_2IN_1OUT(ulonglong2, %) -DECLOP_2VAR_2IN_1OUT(ulonglong2, &) -DECLOP_2VAR_2IN_1OUT(ulonglong2, |) -DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) -DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) -DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) - -DECLOP_2VAR_ASSIGN(ulonglong2, +=) -DECLOP_2VAR_ASSIGN(ulonglong2, -=) -DECLOP_2VAR_ASSIGN(ulonglong2, *=) -DECLOP_2VAR_ASSIGN(ulonglong2, /=) -DECLOP_2VAR_ASSIGN(ulonglong2, %=) -DECLOP_2VAR_ASSIGN(ulonglong2, &=) -DECLOP_2VAR_ASSIGN(ulonglong2, |=) -DECLOP_2VAR_ASSIGN(ulonglong2, ^=) -DECLOP_2VAR_ASSIGN(ulonglong2, <<=) -DECLOP_2VAR_ASSIGN(ulonglong2, >>=) - -DECLOP_2VAR_PREOP(ulonglong2, ++) -DECLOP_2VAR_PREOP(ulonglong2, --) - -DECLOP_2VAR_POSTOP(ulonglong2, ++) -DECLOP_2VAR_POSTOP(ulonglong2, --) - -DECLOP_2VAR_COMP(ulonglong2, ==) -DECLOP_2VAR_COMP(ulonglong2, !=) -DECLOP_2VAR_COMP(ulonglong2, <) -DECLOP_2VAR_COMP(ulonglong2, >) -DECLOP_2VAR_COMP(ulonglong2, <=) -DECLOP_2VAR_COMP(ulonglong2, >=) - -DECLOP_2VAR_COMP(ulonglong2, &&) -DECLOP_2VAR_COMP(ulonglong2, ||) - -DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) - -// UNSIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(ulonglong3, +) -DECLOP_3VAR_2IN_1OUT(ulonglong3, -) -DECLOP_3VAR_2IN_1OUT(ulonglong3, *) -DECLOP_3VAR_2IN_1OUT(ulonglong3, /) -DECLOP_3VAR_2IN_1OUT(ulonglong3, %) -DECLOP_3VAR_2IN_1OUT(ulonglong3, &) -DECLOP_3VAR_2IN_1OUT(ulonglong3, |) -DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) -DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) -DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) - -DECLOP_3VAR_ASSIGN(ulonglong3, +=) -DECLOP_3VAR_ASSIGN(ulonglong3, -=) -DECLOP_3VAR_ASSIGN(ulonglong3, *=) -DECLOP_3VAR_ASSIGN(ulonglong3, /=) -DECLOP_3VAR_ASSIGN(ulonglong3, %=) -DECLOP_3VAR_ASSIGN(ulonglong3, &=) -DECLOP_3VAR_ASSIGN(ulonglong3, |=) -DECLOP_3VAR_ASSIGN(ulonglong3, ^=) -DECLOP_3VAR_ASSIGN(ulonglong3, <<=) -DECLOP_3VAR_ASSIGN(ulonglong3, >>=) - -DECLOP_3VAR_PREOP(ulonglong3, ++) -DECLOP_3VAR_PREOP(ulonglong3, --) - -DECLOP_3VAR_POSTOP(ulonglong3, ++) -DECLOP_3VAR_POSTOP(ulonglong3, --) - -DECLOP_3VAR_COMP(ulonglong3, ==) -DECLOP_3VAR_COMP(ulonglong3, !=) -DECLOP_3VAR_COMP(ulonglong3, <) -DECLOP_3VAR_COMP(ulonglong3, >) -DECLOP_3VAR_COMP(ulonglong3, <=) -DECLOP_3VAR_COMP(ulonglong3, >=) - -DECLOP_3VAR_COMP(ulonglong3, &&) -DECLOP_3VAR_COMP(ulonglong3, ||) - -DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) - -// UNSIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(ulonglong4, +) -DECLOP_4VAR_2IN_1OUT(ulonglong4, -) -DECLOP_4VAR_2IN_1OUT(ulonglong4, *) -DECLOP_4VAR_2IN_1OUT(ulonglong4, /) -DECLOP_4VAR_2IN_1OUT(ulonglong4, %) -DECLOP_4VAR_2IN_1OUT(ulonglong4, &) -DECLOP_4VAR_2IN_1OUT(ulonglong4, |) -DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) -DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) -DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) - -DECLOP_4VAR_ASSIGN(ulonglong4, +=) -DECLOP_4VAR_ASSIGN(ulonglong4, -=) -DECLOP_4VAR_ASSIGN(ulonglong4, *=) -DECLOP_4VAR_ASSIGN(ulonglong4, /=) -DECLOP_4VAR_ASSIGN(ulonglong4, %=) -DECLOP_4VAR_ASSIGN(ulonglong4, &=) -DECLOP_4VAR_ASSIGN(ulonglong4, |=) -DECLOP_4VAR_ASSIGN(ulonglong4, ^=) -DECLOP_4VAR_ASSIGN(ulonglong4, <<=) -DECLOP_4VAR_ASSIGN(ulonglong4, >>=) - -DECLOP_4VAR_PREOP(ulonglong4, ++) -DECLOP_4VAR_PREOP(ulonglong4, --) - -DECLOP_4VAR_POSTOP(ulonglong4, ++) -DECLOP_4VAR_POSTOP(ulonglong4, --) - -DECLOP_4VAR_COMP(ulonglong4, ==) -DECLOP_4VAR_COMP(ulonglong4, !=) -DECLOP_4VAR_COMP(ulonglong4, <) -DECLOP_4VAR_COMP(ulonglong4, >) -DECLOP_4VAR_COMP(ulonglong4, <=) -DECLOP_4VAR_COMP(ulonglong4, >=) - -DECLOP_4VAR_COMP(ulonglong4, &&) -DECLOP_4VAR_COMP(ulonglong4, ||) - -DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) - -// SIGNED LONGLONG1 - -DECLOP_1VAR_2IN_1OUT(longlong1, +) -DECLOP_1VAR_2IN_1OUT(longlong1, -) -DECLOP_1VAR_2IN_1OUT(longlong1, *) -DECLOP_1VAR_2IN_1OUT(longlong1, /) -DECLOP_1VAR_2IN_1OUT(longlong1, %) -DECLOP_1VAR_2IN_1OUT(longlong1, &) -DECLOP_1VAR_2IN_1OUT(longlong1, |) -DECLOP_1VAR_2IN_1OUT(longlong1, ^) -DECLOP_1VAR_2IN_1OUT(longlong1, <<) -DECLOP_1VAR_2IN_1OUT(longlong1, >>) - - -DECLOP_1VAR_ASSIGN(longlong1, +=) -DECLOP_1VAR_ASSIGN(longlong1, -=) -DECLOP_1VAR_ASSIGN(longlong1, *=) -DECLOP_1VAR_ASSIGN(longlong1, /=) -DECLOP_1VAR_ASSIGN(longlong1, %=) -DECLOP_1VAR_ASSIGN(longlong1, &=) -DECLOP_1VAR_ASSIGN(longlong1, |=) -DECLOP_1VAR_ASSIGN(longlong1, ^=) -DECLOP_1VAR_ASSIGN(longlong1, <<=) -DECLOP_1VAR_ASSIGN(longlong1, >>=) - -DECLOP_1VAR_PREOP(longlong1, ++) -DECLOP_1VAR_PREOP(longlong1, --) - -DECLOP_1VAR_POSTOP(longlong1, ++) -DECLOP_1VAR_POSTOP(longlong1, --) - -DECLOP_1VAR_COMP(longlong1, ==) -DECLOP_1VAR_COMP(longlong1, !=) -DECLOP_1VAR_COMP(longlong1, <) -DECLOP_1VAR_COMP(longlong1, >) -DECLOP_1VAR_COMP(longlong1, <=) -DECLOP_1VAR_COMP(longlong1, >=) - -DECLOP_1VAR_COMP(longlong1, &&) -DECLOP_1VAR_COMP(longlong1, ||) - -DECLOP_1VAR_1IN_1OUT(longlong1, ~) -DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) 
- -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) -DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) - -// SIGNED LONGLONG2 - -DECLOP_2VAR_2IN_1OUT(longlong2, +) -DECLOP_2VAR_2IN_1OUT(longlong2, -) -DECLOP_2VAR_2IN_1OUT(longlong2, *) -DECLOP_2VAR_2IN_1OUT(longlong2, /) -DECLOP_2VAR_2IN_1OUT(longlong2, %) -DECLOP_2VAR_2IN_1OUT(longlong2, &) -DECLOP_2VAR_2IN_1OUT(longlong2, |) -DECLOP_2VAR_2IN_1OUT(longlong2, ^) -DECLOP_2VAR_2IN_1OUT(longlong2, <<) -DECLOP_2VAR_2IN_1OUT(longlong2, >>) - -DECLOP_2VAR_ASSIGN(longlong2, +=) -DECLOP_2VAR_ASSIGN(longlong2, -=) -DECLOP_2VAR_ASSIGN(longlong2, *=) -DECLOP_2VAR_ASSIGN(longlong2, /=) -DECLOP_2VAR_ASSIGN(longlong2, %=) -DECLOP_2VAR_ASSIGN(longlong2, &=) -DECLOP_2VAR_ASSIGN(longlong2, |=) -DECLOP_2VAR_ASSIGN(longlong2, ^=) -DECLOP_2VAR_ASSIGN(longlong2, <<=) -DECLOP_2VAR_ASSIGN(longlong2, >>=) - -DECLOP_2VAR_PREOP(longlong2, ++) -DECLOP_2VAR_PREOP(longlong2, --) - -DECLOP_2VAR_POSTOP(longlong2, ++) -DECLOP_2VAR_POSTOP(longlong2, --) - -DECLOP_2VAR_COMP(longlong2, ==) -DECLOP_2VAR_COMP(longlong2, !=) -DECLOP_2VAR_COMP(longlong2, <) -DECLOP_2VAR_COMP(longlong2, >) -DECLOP_2VAR_COMP(longlong2, <=) -DECLOP_2VAR_COMP(longlong2, >=) - -DECLOP_2VAR_COMP(longlong2, &&) -DECLOP_2VAR_COMP(longlong2, ||) - -DECLOP_2VAR_1IN_1OUT(longlong2, ~) -DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) 
- -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) -DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) - -// SIGNED LONGLONG3 - -DECLOP_3VAR_2IN_1OUT(longlong3, +) -DECLOP_3VAR_2IN_1OUT(longlong3, -) -DECLOP_3VAR_2IN_1OUT(longlong3, *) -DECLOP_3VAR_2IN_1OUT(longlong3, /) -DECLOP_3VAR_2IN_1OUT(longlong3, %) -DECLOP_3VAR_2IN_1OUT(longlong3, &) -DECLOP_3VAR_2IN_1OUT(longlong3, |) -DECLOP_3VAR_2IN_1OUT(longlong3, ^) -DECLOP_3VAR_2IN_1OUT(longlong3, <<) -DECLOP_3VAR_2IN_1OUT(longlong3, >>) - -DECLOP_3VAR_ASSIGN(longlong3, +=) -DECLOP_3VAR_ASSIGN(longlong3, -=) -DECLOP_3VAR_ASSIGN(longlong3, *=) -DECLOP_3VAR_ASSIGN(longlong3, /=) -DECLOP_3VAR_ASSIGN(longlong3, %=) -DECLOP_3VAR_ASSIGN(longlong3, &=) -DECLOP_3VAR_ASSIGN(longlong3, |=) -DECLOP_3VAR_ASSIGN(longlong3, ^=) -DECLOP_3VAR_ASSIGN(longlong3, <<=) -DECLOP_3VAR_ASSIGN(longlong3, >>=) - -DECLOP_3VAR_PREOP(longlong3, ++) -DECLOP_3VAR_PREOP(longlong3, --) - -DECLOP_3VAR_POSTOP(longlong3, ++) -DECLOP_3VAR_POSTOP(longlong3, --) - -DECLOP_3VAR_COMP(longlong3, ==) -DECLOP_3VAR_COMP(longlong3, !=) -DECLOP_3VAR_COMP(longlong3, <) -DECLOP_3VAR_COMP(longlong3, >) -DECLOP_3VAR_COMP(longlong3, <=) -DECLOP_3VAR_COMP(longlong3, >=) - -DECLOP_3VAR_COMP(longlong3, &&) -DECLOP_3VAR_COMP(longlong3, ||) - -DECLOP_3VAR_1IN_1OUT(longlong3, ~) -DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) 
- -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) -DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) - -// SIGNED LONGLONG4 - -DECLOP_4VAR_2IN_1OUT(longlong4, +) -DECLOP_4VAR_2IN_1OUT(longlong4, -) -DECLOP_4VAR_2IN_1OUT(longlong4, *) -DECLOP_4VAR_2IN_1OUT(longlong4, /) -DECLOP_4VAR_2IN_1OUT(longlong4, %) -DECLOP_4VAR_2IN_1OUT(longlong4, &) -DECLOP_4VAR_2IN_1OUT(longlong4, |) -DECLOP_4VAR_2IN_1OUT(longlong4, ^) -DECLOP_4VAR_2IN_1OUT(longlong4, <<) -DECLOP_4VAR_2IN_1OUT(longlong4, >>) - -DECLOP_4VAR_ASSIGN(longlong4, +=) -DECLOP_4VAR_ASSIGN(longlong4, -=) -DECLOP_4VAR_ASSIGN(longlong4, *=) -DECLOP_4VAR_ASSIGN(longlong4, /=) -DECLOP_4VAR_ASSIGN(longlong4, %=) -DECLOP_4VAR_ASSIGN(longlong4, &=) -DECLOP_4VAR_ASSIGN(longlong4, |=) -DECLOP_4VAR_ASSIGN(longlong4, ^=) -DECLOP_4VAR_ASSIGN(longlong4, <<=) -DECLOP_4VAR_ASSIGN(longlong4, >>=) - -DECLOP_4VAR_PREOP(longlong4, ++) -DECLOP_4VAR_PREOP(longlong4, --) - -DECLOP_4VAR_POSTOP(longlong4, ++) -DECLOP_4VAR_POSTOP(longlong4, --) - -DECLOP_4VAR_COMP(longlong4, ==) -DECLOP_4VAR_COMP(longlong4, !=) -DECLOP_4VAR_COMP(longlong4, <) -DECLOP_4VAR_COMP(longlong4, >) -DECLOP_4VAR_COMP(longlong4, <=) -DECLOP_4VAR_COMP(longlong4, >=) - -DECLOP_4VAR_COMP(longlong4, &&) -DECLOP_4VAR_COMP(longlong4, ||) - -DECLOP_4VAR_1IN_1OUT(longlong4, ~) -DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) 
- -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) -DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif + #endif diff --git a/projects/clr/hipamd/src/hip_fp16.cpp b/projects/clr/hipamd/src/hip_fp16.cpp index c2b7b47597..8e8f003f56 100644 --- a/projects/clr/hipamd/src/hip_fp16.cpp +++ b/projects/clr/hipamd/src/hip_fp16.cpp @@ -90,11 +90,11 @@ __device__ bool __hgt(__half a, __half b) { } __device__ bool __hisinf(__half a) { - return a == __hInfValue.h ? true : false; + return a == HINF ? true : false; } __device__ bool __hisnan(__half a) { - return a > __hInfValue.h ? true : false; + return a > HINF ? true : false; } __device__ bool __hle(__half a, __half b) { @@ -114,75 +114,75 @@ Half2 Comparision Functions */ __device__ bool __hbeq2(__half2 a, __half2 b) { - return (a.p[0] == b.p[0] ? true : false) && (a.p[1] == b.p[1] ? true : false); + return (a.x == b.x ? true : false) && (a.y == b.y ? true : false); } __device__ bool __hbge2(__half2 a, __half2 b) { - return (a.p[0] >= b.p[0] ? true : false) && (a.p[1] >= b.p[1] ? true : false); + return (a.x >= b.x ? true : false) && (a.y >= b.y ? true : false); } __device__ bool __hbgt2(__half2 a, __half2 b) { - return (a.p[0] > b.p[0] ? true : false) && (a.p[1] > b.p[1] ? true : false); + return (a.x > b.x ? true : false) && (a.y > b.y ? true : false); } __device__ bool __hble2(__half2 a, __half2 b) { - return (a.p[0] <= b.p[0] ? true : false) && (a.p[1] <= b.p[1] ? true : false); + return (a.x <= b.x ? 
true : false) && (a.y <= b.y ? true : false); } __device__ bool __hblt2(__half2 a, __half2 b) { - return (a.p[0] < b.p[0] ? true : false) && (a.p[1] < b.p[1] ? true : false); + return (a.x < b.x ? true : false) && (a.y < b.y ? true : false); } __device__ bool __hbne2(__half2 a, __half2 b) { - return (a.p[0] != b.p[0] ? true : false) && (a.p[1] != b.p[1] ? true : false); + return (a.x != b.x ? true : false) && (a.y != b.y ? true : false); } __device__ __half2 __heq2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] == b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] == b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x == b.x) ? (__half)1 : (__half)0; + c.y = (a.y == b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hge2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] >= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] >= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x >= b.x) ? (__half)1 : (__half)0; + c.y = (a.y >= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hgt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] > b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x > b.x) ? (__half)1 : (__half)0; + c.y = (a.y > b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hisnan2(__half2 a) { __half2 c; - c.p[0] = (a.p[0] > __hInfValue.h) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] > __hInfValue.h) ? (__half)1 : (__half)0; + c.x = (a.x > HINF) ? (__half)1 : (__half)0; + c.y = (a.y > HINF) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hle2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] <= b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] <= b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x <= b.x) ? (__half)1 : (__half)0; + c.y = (a.y <= b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hlt2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] < b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] < b.p[1]) ? 
(__half)1 : (__half)0; + c.x = (a.x < b.x) ? (__half)1 : (__half)0; + c.y = (a.y < b.y) ? (__half)1 : (__half)0; return c; } __device__ __half2 __hne2(__half2 a, __half2 b) { __half2 c; - c.p[0] = (a.p[0] != b.p[0]) ? (__half)1 : (__half)0; - c.p[1] = (a.p[1] != b.p[1]) ? (__half)1 : (__half)0; + c.x = (a.x != b.x) ? (__half)1 : (__half)0; + c.y = (a.y != b.y) ? (__half)1 : (__half)0; return c; } @@ -191,8 +191,8 @@ Conversion instructions */ __device__ __half2 __float22half2_rn(const float2 a) { __half2 b; - b.p[0] = (__half)a.x; - b.p[1] = (__half)a.y; + b.x = (__half)a.x; + b.y = (__half)a.y; return b; } @@ -202,8 +202,8 @@ __device__ __half __float2half(const float a) { __device__ __half2 __float2half2_rn(const float a) { __half2 b; - b.p[0] = (__half)a; - b.p[1] = (__half)a; + b.x = (__half)a; + b.y = (__half)a; return b; } @@ -225,15 +225,15 @@ __device__ __half __float2half_rz(const float a) { __device__ __half2 __floats2half2_rn(const float a, const float b) { __half2 c; - c.p[0] = (__half)a; - c.p[1] = (__half)b; + c.x = (__half)a; + c.y = (__half)b; return c; } __device__ float2 __half22float2(const __half2 a) { float2 b; - b.x = (float)a.p[0]; - b.y = (float)a.p[1]; + b.x = (float)a.x; + b.y = (float)a.y; return b; } @@ -243,8 +243,8 @@ __device__ float __half2float(const __half a) { __device__ __half2 half2half2(const __half a) { __half2 b; - b.p[0] = a; - b.p[1] = a; + b.x = a; + b.y = a; return b; } @@ -358,30 +358,30 @@ __device__ unsigned short int __half_as_ushort(const __half h) { __device__ __half2 __halves2half2(const __half a, const __half b) { __half2 c; - c.p[0] = a; - c.p[1] = b; + c.x = a; + c.y = b; return c; } __device__ float __high2float(const __half2 a) { - return (float)a.p[1]; + return (float)a.y; } __device__ __half __high2half(const __half2 a) { - return a.p[1]; + return a.y; } __device__ __half2 __high2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[1]; + b.x = a.y; + b.y = a.y; return b; } __device__ __half2 
__highs2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[1]; - c.p[1] = b.p[1]; + c.x = a.y; + c.y = b.y; return c; } @@ -418,38 +418,38 @@ __device__ __half __ll2half_rz(long long int i){ } __device__ float __low2float(const __half2 a) { - return (float)a.p[0]; + return (float)a.x; } __device__ __half __low2half(const __half2 a) { - return a.p[0]; + return a.x; } __device__ __half2 __low2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.x = a.x; + c.y = b.x; return c; } __device__ __half2 __low2half2(const __half2 a) { __half2 b; - b.p[0] = a.p[0]; - b.p[1] = a.p[0]; + b.x = a.x; + b.y = a.x; return b; } __device__ __half2 __lowhigh2highlow(const __half2 a) { __half2 b; - b.p[0] = a.p[1]; - b.p[1] = a.p[0]; + b.x = a.y; + b.y = a.x; return b; } __device__ __half2 __lows2half2(const __half2 a, const __half2 b) { __half2 c; - c.p[0] = a.p[0]; - c.p[1] = b.p[0]; + c.y = a.x; + c.y = b.x; return c; } @@ -542,346 +542,4 @@ typedef struct{ }; } struct_float; -#if __clang_major__ == 3 -static __device__ float cvt_half_to_float(__half a){ - struct_float ret = {0}; - if(a.x == 0){ - return 0.0f; - } - if(a.x == 0x8000){ - return -0.0f; - } - ret.u = ((a.x&0x8000)<<16) | (((a.x&0x7c00)+0x1C000)<<13) | ((a.x&0x03FF)<<13); - return ret.f; -} - -static __device__ __half cvt_float_to_half(float b){ - struct_float f = {0}; - __half ret = {0}; - f.f = b; - if(f.f == 0.0f){ - ret.x = 0; - return ret; - } - if(f.f == -0.0f){ - ret.x = 0x8000; - return ret; - } - ret.x = ((f.u>>16)&0x8000)|((((f.u&0x7f800000)-0x38000000)>>13)&0x7c00)|((f.u>>13)&0x03ff); - return ret; -} - - -__device__ __half __soft_hadd(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b)); -} - -__device__ __half __soft_hadd_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) + cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? 
__half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hfma(const __half a, const __half b, const __half c){ - return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c))); -} - -__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){ - float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hmul(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b)); -} - -__device__ __half __soft_hmul_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) * cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f))); -} - -__device__ __half __soft_hneq(const __half a){ - __half ret = {a.x}; - ret.x ^= 1 << 15; - return ret; -} - -__device__ __half __soft_hsub(const __half a, const __half b){ - return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b)); -} - -__device__ __half __soft_hsub_sat(const __half a, const __half b){ - float f = cvt_half_to_float(a) - cvt_half_to_float(b); - return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? 
__half_value_one_float: cvt_float_to_half(f))); -} - - -/* -Half2 Arithmetic Instructions -*/ - -__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){ - __half2 ret; - ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]); - ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hneq2(const __half2 a){ - __half2 ret; - ret.p[1] = __soft_hneq(a.p[1]); - ret.p[0] = __soft_hneq(a.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub(a.p[0], b.p[0]); - return ret; -} - -__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){ - __half2 ret; - ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]); - ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]); - return ret; -} - -/* -Half Cmps -*/ - -__device__ bool __soft_heq(const __half a, const __half b){ - return (a.x == b.x ? 
true:false); -} - -__device__ bool __soft_hge(const __half a, const __half b){ - return (cvt_half_to_float(a) >= cvt_half_to_float(b)); -} - -__device__ bool __soft_hgt(const __half a, const __half b){ - return (cvt_half_to_float(a) > cvt_half_to_float(b)); -} - -__device__ bool __soft_hisinf(const __half a){ - return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0); -} - -__device__ bool __soft_hisnan(const __half a){ - if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){ - return true; - }else{ - return false; - } -} - -__device__ bool __soft_hle(const __half a, const __half b){ - return (cvt_half_to_float(a) <= cvt_half_to_float(b)); -} - -__device__ bool __soft_hlt(const __half a, const __half b){ - return (cvt_half_to_float(a) < cvt_half_to_float(b)); -} - -__device__ bool __soft_hne(const __half a, const __half b){ - return a.x == b.x ? false : true; -} - -/* -Half2 Cmps -*/ - -__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){ - return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbge2(const __half2 a, const __half2 b){ - return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){ - return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hble2(const __half2 a, const __half2 b){ - return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]); -} - -__device__ bool __soft_hblt2(const __half2 a, const __half2 b){ - return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]); -} - -__device__ bool __soft_hbne2(const __half2 a, const __half2 b){ - return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]); -} - - - -__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? 
__half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hisnan2(const __half2 a){ - __half2 ret = {0}; - ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float; - return ret; -} - -__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){ - __half2 ret = {0}; - ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float; - ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? 
__half_value_one_float : __half_value_zero_float; - return ret; -} - -/* -Half Cnvs and Data Mvmnt -*/ - -__device__ __half2 __soft_float22half2_rn(const float2 a){ - __half2 ret = {0}; - ret.p[1] = cvt_float_to_half(a.x); - ret.p[0] = cvt_float_to_half(a.y); - return ret; -} - -__device__ __half __soft_float2half(const float a){ - return cvt_float_to_half(a); -} - -__device__ __half2 __soft_float2half2_rn(const float a){ - __half ret = cvt_float_to_half(a); - return {ret, ret}; -} - -__device__ __half2 __soft_floats2half2_rn(const float a, const float b){ - return {cvt_float_to_half(a), cvt_float_to_half(b)}; -} - -__device__ float2 __soft_half22float2(const __half2 a){ - return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])}; -} - -__device__ float __soft_half2float(const __half a){ - return cvt_half_to_float(a); -} - -__device__ __half2 __soft_half2half2(const __half a){ - return {a,a}; -} - -__device__ __half2 __soft_halves2half2(const __half a, const __half b){ - return {a,b}; -} - -__device__ float __soft_high2float(const __half2 a){ - return cvt_half_to_float(a.p[1]); -} - -__device__ __half __soft_high2half(const __half2 a){ - return a.p[1]; -} - -__device__ __half2 __soft_high2half2(const __half2 a){ - return {a.p[1], a.p[1]}; -} - -__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){ - return {a.p[1], b.p[1]}; -} - -__device__ float __soft_low2float(const __half2 a){ - return cvt_half_to_float(a.p[0]); -} - -__device__ __half __soft_low2half(const __half2 a){ - return a.p[0]; -} - -__device__ __half2 __soft_low2half2(const __half2 a){ - return {a.p[0], a.p[0]}; -} - -__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - -__device__ __half2 __soft_lowhigh2highlow(const __half2 a){ - return {a.p[0], a.p[1]}; -} - -__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){ - return {a.p[0], b.p[0]}; -} - - - -#endif diff --git a/projects/clr/hipamd/src/hip_hc_gfx803.ll 
b/projects/clr/hipamd/src/hip_hc_gfx803.ll index 0080fc7d81..7e3d0e37dd 100644 --- a/projects/clr/hipamd/src/hip_hc_gfx803.ll +++ b/projects/clr/hipamd/src/hip_hc_gfx803.ll @@ -2,89 +2,122 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64: target triple = "amdgcn--amdhsa" -define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hadd2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 { - %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %1, i32 %c) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hfma2_int(<2 x half> %a, <2 x half> %b, <2 x half> %c) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = bitcast <2 x half> %c to i32 + %4 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %1, i32 %2, i32 %3) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %1, i32 %2) + tail call void asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %4, i32 %4, i32 %3) + %5 = bitcast i32 %4 to <2 x half> + ret <2 x half> %5 } -define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hmul2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_mul_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 { - %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b) - tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %1, i32 %a, i32 %b) - ret i32 %1 +define <2 x half> @__hip_hc_ir_hsub2_int(<2 x half> %a, <2 x half> %b) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = bitcast <2 x half> %b to i32 + %3 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %1, i32 %2) + tail call void asm sideeffect "v_sub_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","v,v,v"(i32 %3, i32 %1, i32 %2) + %4 = bitcast i32 %3 to <2 x half> + ret <2 x half> %4 } -define i32 @__hip_hc_ir_h2ceil_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect 
"v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2ceil_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_ceil_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_ceil_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2cos_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2cos_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_cos_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_cos_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2exp2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2exp2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_exp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_exp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2floor_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 
%a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2floor_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_floor_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_floor_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2log2_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2log2_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_log_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_log_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rcp_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rcp_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rcp_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rcp_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2rsqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2rsqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x 
half> %a to i32 + %2 = tail call i32 asm sideeffect "v_rsq_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_rsq_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sin_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sin_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sin_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sin_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2sqrt_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2sqrt_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_sqrt_f16 $0, $1","=v,v"(i32 %1) + tail call void asm sideeffect "v_sqrt_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } -define i32 @__hip_hc_ir_h2trunc_int(i32 %a) #1 { - %1 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %a) - tail call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %1, i32 %a) - ret i32 %1 +define <2 x half> @__hip_hc_ir_h2trunc_int(<2 x half> %a) #1 { + %1 = bitcast <2 x half> %a to i32 + %2 = tail call i32 asm sideeffect "v_trunc_f16 $0, $1","=v,v"(i32 %1) + tail 
call void asm sideeffect "v_trunc_f16_sdwa $0, $1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1","v,v"(i32 %2, i32 %1) + %3 = bitcast i32 %2 to <2 x half> + ret <2 x half> %3 } attributes #1 = { alwaysinline nounwind } From 8c50285d3037e1a95804a6d01a967ced35171da6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 16 May 2017 21:59:14 -0500 Subject: [PATCH 091/171] Return precise address for hipHostGetDevicePointer. [ROCm/clr commit: ee37a31799cb92705f1161b4cd07e1276f7f7663] --- projects/clr/hipamd/src/hip_memory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 5501fec734..fc2ada134e 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -202,7 +202,8 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, hostPointer); if (status == AM_SUCCESS) { - *devicePointer = amPointerInfo._devicePointer; + *devicePointer = static_cast(amPointerInfo._devicePointer) + (static_cast(hostPointer) - static_cast(amPointerInfo._hostPointer)) ; + tprintf(DB_MEM, " host_ptr=%p returned device_pointer=%p\n", hostPointer, *devicePointer); } else { e = hipErrorMemoryAllocation; } From 905ab9a6892b176998d911c494516e410e532c1f Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Thu, 18 May 2017 10:50:56 -0500 Subject: [PATCH 092/171] fixed vector type issues by reverting to old code, changed __half2 to map to vector types in llvm Change-Id: I7317408c25e8c1a0c02a346042c9137e160c8bbd [ROCm/clr commit: 0433a2e6083d9cb04154b125d518dab214c09c4c] --- .../hipamd/include/hip/hcc_detail/hip_fp16.h | 5 +- .../include/hip/hcc_detail/hip_vector_types.h | 4038 ++++++++++++++++- 2 files changed, 3978 insertions(+), 65 deletions(-) diff --git 
a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index f1f52e4122..a1abce2191 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -25,8 +25,9 @@ THE SOFTWARE. #include "hip/hcc_detail/hip_vector_types.h" -typedef __half half; -typedef __half2 half2; +typedef __fp16 __half; +typedef __fp16 __half1 __attribute__((ext_vector_type(1))); +typedef __fp16 __half2 __attribute__((ext_vector_type(2))); /* Half Arithmetic Functions diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 251da504ab..3c3b26c12a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -34,93 +34,1120 @@ THE SOFTWARE. #include "hip/hcc_detail/host_defines.h" -#if __cplusplus +#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x) { } \ +__device__ __host__ type(const type& val) : x(val.x) { } -typedef unsigned char uchar1 __attribute__((ext_vector_type(1))); -typedef unsigned char uchar2 __attribute__((ext_vector_type(2))); -typedef unsigned char uchar3 __attribute__((ext_vector_type(3))); -typedef unsigned char uchar4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } -typedef signed char char1 __attribute__((ext_vector_type(1))); -typedef signed char char2 __attribute__((ext_vector_type(2))); -typedef signed char char3 __attribute__((ext_vector_type(3))); -typedef signed char char4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ +__device__ __host__ 
type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } -typedef unsigned short ushort1 __attribute__((ext_vector_type(1))); -typedef unsigned short ushort2 __attribute__((ext_vector_type(2))); -typedef unsigned short ushort3 __attribute__((ext_vector_type(3))); -typedef unsigned short ushort4 __attribute__((ext_vector_type(4))); +#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ +__device__ __host__ type() {} \ +__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } -typedef signed short short1 __attribute__((ext_vector_type(1))); -typedef signed short short2 __attribute__((ext_vector_type(2))); -typedef signed short short3 __attribute__((ext_vector_type(3))); -typedef signed short short4 __attribute__((ext_vector_type(4))); -typedef __fp16 __half; +#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val) {} \ -typedef __fp16 __half1 __attribute__((ext_vector_type(1))); -typedef __fp16 __half2 __attribute__((ext_vector_type(2))); -typedef __fp16 __half3 __attribute__((ext_vector_type(3))); -typedef __fp16 __half4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val) {} \ +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} -typedef unsigned int uint1 __attribute__((ext_vector_type(1))); -typedef unsigned int uint2 __attribute__((ext_vector_type(2))); -typedef unsigned int uint3 __attribute__((ext_vector_type(3))); -typedef unsigned int uint4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 
val3) : x(val1), y(val2), z(val3) {} -typedef signed int int1 __attribute__((ext_vector_type(1))); -typedef signed int int2 __attribute__((ext_vector_type(2))); -typedef signed int int3 __attribute__((ext_vector_type(3))); -typedef signed int int4 __attribute__((ext_vector_type(4))); +#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ +__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} -typedef float float1 __attribute__((ext_vector_type(1))); -typedef float float2 __attribute__((ext_vector_type(2))); -typedef float float3 __attribute__((ext_vector_type(3))); -typedef float float4 __attribute__((ext_vector_type(4))); +struct uchar1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long) -typedef unsigned long ulong1 __attribute__((ext_vector_type(1))); -typedef unsigned long ulong2 __attribute__((ext_vector_type(2))); -typedef unsigned long ulong3 __attribute__((ext_vector_type(3))); -typedef unsigned long ulong4 __attribute__((ext_vector_type(4))); + #endif + unsigned char x; -typedef signed long long1 
__attribute__((ext_vector_type(1))); -typedef signed long long2 __attribute__((ext_vector_type(2))); -typedef signed long long3 __attribute__((ext_vector_type(3))); -typedef signed long long4 __attribute__((ext_vector_type(4))); +} __attribute__((aligned(1))); -typedef double double1 __attribute__((ext_vector_type(1))); -typedef double double2 __attribute__((ext_vector_type(2))); -typedef double double3 __attribute__((ext_vector_type(3))); -typedef double double4 __attribute__((ext_vector_type(4))); +struct uchar2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2) -typedef unsigned long long ulonglong1 __attribute__((ext_vector_type(1))); -typedef unsigned long long ulonglong2 __attribute__((ext_vector_type(2))); -typedef unsigned long long ulonglong3 __attribute__((ext_vector_type(3))); -typedef unsigned long long ulonglong4 __attribute__((ext_vector_type(4))); + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long) + #endif + union { + struct { + unsigned char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); -typedef signed long long longlong1 __attribute__((ext_vector_type(1))); -typedef signed long long longlong2 __attribute__((ext_vector_type(2))); -typedef signed long long longlong3 
__attribute__((ext_vector_type(3))); -typedef signed long long longlong4 __attribute__((ext_vector_type(4))); +struct uchar3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long) + #endif + unsigned char x, y, z; +}; + +struct uchar4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long) + #endif + union { + struct { + unsigned char x, y, z, w; 
+ }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + +struct char1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long) + #endif + signed char x; +} __attribute__((aligned(1))); + +struct char2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long) + #endif + union { + struct { + signed char x, y; + }; + unsigned short a; + }; +} __attribute__((aligned(2))); + +struct char3 { + #ifdef __cplusplus 
+ public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long) + #endif + signed char x, y, z; +}; + +struct char4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long) + #endif + union { + struct { + signed char x, y, z, w; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + + + +struct ushort1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1) + + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long) + #endif + unsigned short x; +} __attribute__((aligned(2))); + +struct ushort2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long) + #endif + union { + struct { + unsigned short x, y; + }; + unsigned int a; + }; +} __attribute__((aligned(4))); + +struct ushort3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3) + + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long) + #endif + unsigned short x, y, z; +}; + +struct ushort4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long) + #endif + union { + struct { + unsigned short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + +struct short1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1) + + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long) + #endif + signed short x; +} __attribute__((aligned(2))); + +struct short2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long) + #endif + union { + struct { + signed short x, y; + }; + unsigned int a; + }; + +} __attribute__((aligned(4))); + +struct short3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char) + 
MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long) + #endif + signed short x, y, z; +}; + +struct short4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long) + #endif + union { + struct { + signed short x, y, z, w; + }; + unsigned int a, b; + }; +} __attribute__((aligned(8))); + + +struct uint1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, 
signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long) + #endif + unsigned int x; +} __attribute__((aligned(4))); + +struct uint2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long) + #endif + unsigned int x, y; +} __attribute__((aligned(8))); + +struct uint3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed 
short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long) + #endif + unsigned int x, y, z; +}; + +struct uint4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long) + #endif + unsigned int x, y, z, w; +} __attribute__((aligned(16))); + +struct int1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long) + #endif + signed int x; +} __attribute__((aligned(4))); + +struct int2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long) + #endif + signed int x, y; +} __attribute__((aligned(8))); + +struct int3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, 
unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long) + #endif + signed int x, y, z; +}; + +struct int4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long) + #endif + signed int x, y, z, w; +} __attribute__((aligned(16))); + + +struct float1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long) + 
MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long) + #endif + float x; +} __attribute__((aligned(4))); + +struct float2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long) + #endif + float x, y; +} __attribute__((aligned(8))); + +struct float3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long) + #endif + float x, y, z; +}; + +struct 
float4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long) + #endif + float x, y, z, w; +} __attribute__((aligned(16))); + + + +struct double1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long) + #endif + double x; +} __attribute__((aligned(8))); + +struct double2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2) + + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long) + #endif + double x, y; +} __attribute__((aligned(16))); + +struct double3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long) + #endif + double x, y, z; +}; + +struct double4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long) + #endif + double x, y, z, w; +} __attribute__((aligned(32))); + + +struct ulong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long) + #endif + unsigned long x; +} __attribute__((aligned(8))); + +struct ulong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long) + #endif + unsigned long x, y; +} __attribute__((aligned(16))); + +struct ulong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long) + #endif + unsigned long x, y, z; +}; + +struct ulong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long) + #endif + unsigned long x, y, z, w; +} __attribute__((aligned(32))); + + +struct long1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long) + #endif + signed long x; +} __attribute__((aligned(8))); + +struct long2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned 
int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long) + #endif + signed long x, y; +} __attribute__((aligned(16))); + +struct long3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long) + #endif + signed long x, y, z; +}; + +struct long4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long) + #endif + signed long x, y, z, w; +} __attribute__((aligned(32))); + + +struct ulonglong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long) + #endif + unsigned long long x; +} __attribute__((aligned(8))); + +struct ulonglong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long) + #endif + unsigned long long x, y; +} __attribute__((aligned(16))); + +struct ulonglong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long) + #endif + unsigned long long x, y, z; +}; + +struct ulonglong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long) + #endif + unsigned long long x, y, z, w; +} __attribute__((aligned(32))); + + +struct longlong1 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1) + + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long) + #endif + signed long long x; +} __attribute__((aligned(8))); + +struct longlong2 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2) + + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long) + #endif + signed long long x, y; +} __attribute__((aligned(16))); + +struct longlong3 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3) + + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long) + #endif + signed long long x, y, z; +}; + +struct longlong4 { + #ifdef __cplusplus + public: + MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4) + + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int) + 
MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long) + MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long) + #endif + signed long x, y, z, w; +} __attribute__((aligned(32))); #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x) { \ + struct type ret; \ ret.x = x; \ return ret; \ } #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -128,8 +1155,8 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \ } #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \ -__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \ - type ret; \ +__device__ __host__ static inline struct type make_##type(comp x, comp y, comp z, comp w) { \ + struct type ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ @@ -137,7 +1164,6 @@ __device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp return ret; \ } - DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1); DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2); DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3); @@ -199,6 +1225,2892 @@ 
DECLOP_MAKE_THREE_COMPONENT(signed long, longlong3);
DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4);

+#ifdef __cplusplus
+
+// Operator-generator macros for the vector structs.
+// Each DECLOP_<N>VAR_* macro expands to one or more free `static inline`
+// operator overloads (declared `__device__ __host__`) that apply `op`
+// independently to the first N components (x, y, z, w) of `type`.
+//   type  - vector struct the operator is generated for
+//   op    - operator token pasted into both the signature and the body
+//   type1 - scalar type accepted by the *_SCALE_PRODUCT overloads
+
+// Binary `lhs op rhs`: returns a new `type` holding the component-wise result.
+#define DECLOP_1VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs.x op rhs.x; \
+    return ret; \
+}
+
+// `vector * scalar` and `scalar * vector`: multiplies the component by the
+// `type1` scalar and returns the result by value.
+#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+    type ret; \
+    ret.x = lhs.x * rhs; \
+    return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs * rhs.x; \
+    return ret; \
+}
+
+// Compound assignment (`op` is a token such as `+=`): updates `lhs` in place
+// and returns it by reference.
+#define DECLOP_1VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+    lhs.x op rhs.x; \
+    return lhs; \
+}
+
+// Prefix form (`op` is `++` or `--`): modifies the operand and returns it by
+// reference.
+#define DECLOP_1VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+    op val.x; \
+    return val; \
+}
+
+// Postfix form (selected by the dummy `int` parameter): returns a copy taken
+// before `op` is applied to the operand.
+#define DECLOP_1VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+    type ret; \
+    ret.x = val.x; \
+    val.x op; \
+    return ret; \
+}
+
+// Comparison: one overload per const/non-const combination of the operands,
+// each returning `lhs.x op rhs.x`.
+#define DECLOP_1VAR_COMP(type, op) \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+    return lhs.x op rhs.x; \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+    return lhs.x op rhs.x; \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+    return lhs.x op rhs.x ; \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+    return lhs.x op rhs.x ; \
+}
+
+// Unary `op rhs` returning a new `type`. The operand is taken by const
+// reference so that const objects and temporaries (e.g. the result of
+// `a + b`) are accepted as well as plain lvalues.
+#define DECLOP_1VAR_1IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+    type ret; \
+    ret.x = op rhs.x; \
+    return ret; \
+}
+
+// Unary `op rhs` returning bool. Operand is a const reference for the same
+// reason as DECLOP_1VAR_1IN_1OUT.
+#define DECLOP_1VAR_1IN_BOOLOUT(type, op) \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+    return op rhs.x; \
+}
+
+/*
+ Two Element Access
+*/
+
+// Two-component (x, y) variants of the generators above.
+
+// Binary `lhs op rhs`, applied to x and y independently.
+#define DECLOP_2VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs.x op rhs.x; \
+    ret.y = lhs.y op rhs.y; \
+    return ret; \
+}
+
+// `vector * scalar` and `scalar * vector` for x and y.
+#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+    type ret; \
+    ret.x = lhs.x * rhs; \
+    ret.y = lhs.y * rhs; \
+    return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs * rhs.x; \
+    ret.y = lhs * rhs.y; \
+    return ret; \
+}
+
+// Compound assignment on x and y; returns `lhs` by reference.
+#define DECLOP_2VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+    lhs.x op rhs.x; \
+    lhs.y op rhs.y; \
+    return lhs; \
+}
+
+// Prefix `++`/`--` on x and y; returns the operand by reference.
+#define DECLOP_2VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+    op val.x; \
+    op val.y; \
+    return val; \
+}
+
+// Postfix `++`/`--` on x and y; returns the pre-modification copy.
+#define DECLOP_2VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+    type ret; \
+    ret.x = val.x; \
+    ret.y = val.y; \
+    val.x op; \
+    val.y op; \
+    return ret; \
+}
+
+// Comparison: true only when `op` holds for x AND for y.
+// NOTE(review): because the per-component results are combined with `&&`,
+// instantiating this with `!=` yields true only when every component differs;
+// confirm that is intended before generating `!=` from this macro.
+#define DECLOP_2VAR_COMP(type, op) \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+}
+
+// Unary `op rhs` on x and y; const-reference operand accepts temporaries.
+#define DECLOP_2VAR_1IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+    type ret; \
+    ret.x = op rhs.x; \
+    ret.y = op rhs.y; \
+    return ret; \
+}
+
+// Unary `op rhs` returning bool: true only when `op` applied to x AND to y
+// are both true.
+#define DECLOP_2VAR_1IN_BOOLOUT(type, op) \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+    return (op rhs.x) && (op rhs.y); \
+}
+
+
+/*
+ Three Element Access
+*/
+
+// Three-component (x, y, z) variants of the generators above.
+
+// Binary `lhs op rhs`, applied to x, y and z independently.
+#define DECLOP_3VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op (const type& lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs.x op rhs.x; \
+    ret.y = lhs.y op rhs.y; \
+    ret.z = lhs.z op rhs.z; \
+    return ret; \
+}
+
+// `vector * scalar` and `scalar * vector` for x, y and z.
+#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+    type ret; \
+    ret.x = lhs.x * rhs; \
+    ret.y = lhs.y * rhs; \
+    ret.z = lhs.z * rhs; \
+    return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs * rhs.x; \
+    ret.y = lhs * rhs.y; \
+    ret.z = lhs * rhs.z; \
+    return ret; \
+}
+
+// Compound assignment on x, y and z; returns `lhs` by reference.
+#define DECLOP_3VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+    lhs.x op rhs.x; \
+    lhs.y op rhs.y; \
+    lhs.z op rhs.z; \
+    return lhs; \
+}
+
+// Prefix `++`/`--` on x, y and z; returns the operand by reference.
+#define DECLOP_3VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+    op val.x; \
+    op val.y; \
+    op val.z; \
+    return val; \
+}
+
+// Postfix `++`/`--` on x, y and z; returns the pre-modification copy.
+#define DECLOP_3VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+    type ret; \
+    ret.x = val.x; \
+    ret.y = val.y; \
+    ret.z = val.z; \
+    val.x op; \
+    val.y op; \
+    val.z op; \
+    return ret; \
+}
+
+// Comparison: true only when `op` holds for x AND y AND z.
+// NOTE(review): with `!=` this is true only when every component differs;
+// confirm that is intended before generating `!=` from this macro.
+// The definition ends at the closing brace (no trailing line continuation),
+// so it does not depend on the following line being blank.
+#define DECLOP_3VAR_COMP(type, op) \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+}
+
+// Unary `op rhs` on x, y and z; const-reference operand accepts temporaries.
+#define DECLOP_3VAR_1IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+    type ret; \
+    ret.x = op rhs.x; \
+    ret.y = op rhs.y; \
+    ret.z = op rhs.z; \
+    return ret; \
+}
+
+// Unary `op rhs` returning bool: true only when `op` applied to each of
+// x, y and z is true.
+#define DECLOP_3VAR_1IN_BOOLOUT(type, op) \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+    return (op rhs.x) && (op rhs.y) && (op rhs.z); \
+}
+
+
+/*
+ Four Element Access
+*/
+
+// Four-component (x, y, z, w) variants of the generators above.
+
+// Binary `lhs op rhs`, applied to x, y, z and w independently.
+#define DECLOP_4VAR_2IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op ( const type& lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs.x op rhs.x; \
+    ret.y = lhs.y op rhs.y; \
+    ret.z = lhs.z op rhs.z; \
+    ret.w = lhs.w op rhs.w; \
+    return ret; \
+}
+
+// `vector * scalar` and `scalar * vector` for x, y, z and w.
+#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \
+__device__ __host__ static inline type operator * (const type& lhs, type1 rhs) { \
+    type ret; \
+    ret.x = lhs.x * rhs; \
+    ret.y = lhs.y * rhs; \
+    ret.z = lhs.z * rhs; \
+    ret.w = lhs.w * rhs; \
+    return ret; \
+} \
+\
+__device__ __host__ static inline type operator * (type1 lhs, const type& rhs) { \
+    type ret; \
+    ret.x = lhs * rhs.x; \
+    ret.y = lhs * rhs.y; \
+    ret.z = lhs * rhs.z; \
+    ret.w = lhs * rhs.w; \
+    return ret; \
+}
+
+// Compound assignment on x, y, z and w; returns `lhs` by reference.
+#define DECLOP_4VAR_ASSIGN(type, op) \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
+    lhs.x op rhs.x; \
+    lhs.y op rhs.y; \
+    lhs.z op rhs.z; \
+    lhs.w op rhs.w; \
+    return lhs; \
+}
+
+// Prefix `++`/`--` on x, y, z and w; returns the operand by reference.
+#define DECLOP_4VAR_PREOP(type, op) \
+__device__ __host__ static inline type& operator op (type& val) { \
+    op val.x; \
+    op val.y; \
+    op val.z; \
+    op val.w; \
+    return val; \
+}
+
+// Postfix `++`/`--` on x, y, z and w; returns the pre-modification copy.
+#define DECLOP_4VAR_POSTOP(type, op) \
+__device__ __host__ static inline type operator op (type& val, int) { \
+    type ret; \
+    ret.x = val.x; \
+    ret.y = val.y; \
+    ret.z = val.z; \
+    ret.w = val.w; \
+    val.x op; \
+    val.y op; \
+    val.z op; \
+    val.w op; \
+    return ret; \
+}
+
+// Comparison: true only when `op` holds for x AND y AND z AND w.
+// NOTE(review): with `!=` this is true only when every component differs;
+// confirm that is intended before generating `!=` from this macro.
+#define DECLOP_4VAR_COMP(type, op) \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+} \
+__device__ __host__ static inline bool operator op (const type& lhs, const type& rhs) { \
+    return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); \
+}
+
+// Unary `op rhs` on x, y, z and w; const-reference operand accepts temporaries.
+#define DECLOP_4VAR_1IN_1OUT(type, op) \
+__device__ __host__ static inline type operator op(const type& rhs) { \
+    type ret; \
+    ret.x = op rhs.x; \
+    ret.y = op rhs.y; \
+    ret.z = op rhs.z; \
+    ret.w = op rhs.w; \
+    return ret; \
+}
+
+// Unary `op rhs` returning bool: true only when `op` applied to each of
+// x, y, z and w is true.
+#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
+__device__ __host__ static inline bool operator op (const type& rhs) { \
+    return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); \
+}
+
+
+/*
+Overloading operators
+*/
+
+// UNSIGNED CHAR1
+
+// Arithmetic, bitwise and shift operators returning a new uchar1.
+DECLOP_1VAR_2IN_1OUT(uchar1, +)
+DECLOP_1VAR_2IN_1OUT(uchar1, -)
+DECLOP_1VAR_2IN_1OUT(uchar1, *)
+DECLOP_1VAR_2IN_1OUT(uchar1, /)
+DECLOP_1VAR_2IN_1OUT(uchar1, %)
+DECLOP_1VAR_2IN_1OUT(uchar1, &)
+DECLOP_1VAR_2IN_1OUT(uchar1, |)
+DECLOP_1VAR_2IN_1OUT(uchar1, ^)
+DECLOP_1VAR_2IN_1OUT(uchar1, <<)
+DECLOP_1VAR_2IN_1OUT(uchar1, >>)
+
+
+// Matching compound-assignment operators.
+DECLOP_1VAR_ASSIGN(uchar1, +=)
+DECLOP_1VAR_ASSIGN(uchar1, -=)
+DECLOP_1VAR_ASSIGN(uchar1, *=)
+DECLOP_1VAR_ASSIGN(uchar1, /=)
+DECLOP_1VAR_ASSIGN(uchar1, %=)
+DECLOP_1VAR_ASSIGN(uchar1, &=)
+DECLOP_1VAR_ASSIGN(uchar1, |=)
+DECLOP_1VAR_ASSIGN(uchar1, ^=)
+DECLOP_1VAR_ASSIGN(uchar1, <<=)
+DECLOP_1VAR_ASSIGN(uchar1, >>=)
+
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(uchar1, ++)
+DECLOP_1VAR_PREOP(uchar1, --)
+
+DECLOP_1VAR_POSTOP(uchar1, ++) +DECLOP_1VAR_POSTOP(uchar1, --) + +DECLOP_1VAR_COMP(uchar1, ==) +DECLOP_1VAR_COMP(uchar1, !=) +DECLOP_1VAR_COMP(uchar1, <) +DECLOP_1VAR_COMP(uchar1, >) +DECLOP_1VAR_COMP(uchar1, <=) +DECLOP_1VAR_COMP(uchar1, >=) + +DECLOP_1VAR_COMP(uchar1, &&) +DECLOP_1VAR_COMP(uchar1, ||) + +DECLOP_1VAR_1IN_1OUT(uchar1, ~) +DECLOP_1VAR_1IN_BOOLOUT(uchar1, !) + +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, float) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, double) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long) + +// UNSIGNED CHAR2 + +DECLOP_2VAR_2IN_1OUT(uchar2, +) +DECLOP_2VAR_2IN_1OUT(uchar2, -) +DECLOP_2VAR_2IN_1OUT(uchar2, *) +DECLOP_2VAR_2IN_1OUT(uchar2, /) +DECLOP_2VAR_2IN_1OUT(uchar2, %) +DECLOP_2VAR_2IN_1OUT(uchar2, &) +DECLOP_2VAR_2IN_1OUT(uchar2, |) +DECLOP_2VAR_2IN_1OUT(uchar2, ^) +DECLOP_2VAR_2IN_1OUT(uchar2, <<) +DECLOP_2VAR_2IN_1OUT(uchar2, >>) + +DECLOP_2VAR_ASSIGN(uchar2, +=) +DECLOP_2VAR_ASSIGN(uchar2, -=) +DECLOP_2VAR_ASSIGN(uchar2, *=) +DECLOP_2VAR_ASSIGN(uchar2, /=) +DECLOP_2VAR_ASSIGN(uchar2, %=) +DECLOP_2VAR_ASSIGN(uchar2, &=) +DECLOP_2VAR_ASSIGN(uchar2, |=) +DECLOP_2VAR_ASSIGN(uchar2, ^=) +DECLOP_2VAR_ASSIGN(uchar2, <<=) +DECLOP_2VAR_ASSIGN(uchar2, >>=) + +DECLOP_2VAR_PREOP(uchar2, ++) +DECLOP_2VAR_PREOP(uchar2, --) + +DECLOP_2VAR_POSTOP(uchar2, ++) +DECLOP_2VAR_POSTOP(uchar2, --) + +DECLOP_2VAR_COMP(uchar2, ==) +DECLOP_2VAR_COMP(uchar2, !=) +DECLOP_2VAR_COMP(uchar2, <) +DECLOP_2VAR_COMP(uchar2, >) +DECLOP_2VAR_COMP(uchar2, <=) +DECLOP_2VAR_COMP(uchar2, >=) + +DECLOP_2VAR_COMP(uchar2, &&) 
+DECLOP_2VAR_COMP(uchar2, ||) + +DECLOP_2VAR_1IN_1OUT(uchar2, ~) +DECLOP_2VAR_1IN_BOOLOUT(uchar2, !) + +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, float) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, double) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long) + +// UNSIGNED CHAR3 + +DECLOP_3VAR_2IN_1OUT(uchar3, +) +DECLOP_3VAR_2IN_1OUT(uchar3, -) +DECLOP_3VAR_2IN_1OUT(uchar3, *) +DECLOP_3VAR_2IN_1OUT(uchar3, /) +DECLOP_3VAR_2IN_1OUT(uchar3, %) +DECLOP_3VAR_2IN_1OUT(uchar3, &) +DECLOP_3VAR_2IN_1OUT(uchar3, |) +DECLOP_3VAR_2IN_1OUT(uchar3, ^) +DECLOP_3VAR_2IN_1OUT(uchar3, <<) +DECLOP_3VAR_2IN_1OUT(uchar3, >>) + +DECLOP_3VAR_ASSIGN(uchar3, +=) +DECLOP_3VAR_ASSIGN(uchar3, -=) +DECLOP_3VAR_ASSIGN(uchar3, *=) +DECLOP_3VAR_ASSIGN(uchar3, /=) +DECLOP_3VAR_ASSIGN(uchar3, %=) +DECLOP_3VAR_ASSIGN(uchar3, &=) +DECLOP_3VAR_ASSIGN(uchar3, |=) +DECLOP_3VAR_ASSIGN(uchar3, ^=) +DECLOP_3VAR_ASSIGN(uchar3, <<=) +DECLOP_3VAR_ASSIGN(uchar3, >>=) + +DECLOP_3VAR_PREOP(uchar3, ++) +DECLOP_3VAR_PREOP(uchar3, --) + +DECLOP_3VAR_POSTOP(uchar3, ++) +DECLOP_3VAR_POSTOP(uchar3, --) + +DECLOP_3VAR_COMP(uchar3, ==) +DECLOP_3VAR_COMP(uchar3, !=) +DECLOP_3VAR_COMP(uchar3, <) +DECLOP_3VAR_COMP(uchar3, >) +DECLOP_3VAR_COMP(uchar3, <=) +DECLOP_3VAR_COMP(uchar3, >=) + +DECLOP_3VAR_COMP(uchar3, &&) +DECLOP_3VAR_COMP(uchar3, ||) + +DECLOP_3VAR_1IN_1OUT(uchar3, ~) +DECLOP_3VAR_1IN_BOOLOUT(uchar3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, float) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, double) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long) + +// UNSIGNED CHAR4 + +DECLOP_4VAR_2IN_1OUT(uchar4, +) +DECLOP_4VAR_2IN_1OUT(uchar4, -) +DECLOP_4VAR_2IN_1OUT(uchar4, *) +DECLOP_4VAR_2IN_1OUT(uchar4, /) +DECLOP_4VAR_2IN_1OUT(uchar4, %) +DECLOP_4VAR_2IN_1OUT(uchar4, &) +DECLOP_4VAR_2IN_1OUT(uchar4, |) +DECLOP_4VAR_2IN_1OUT(uchar4, ^) +DECLOP_4VAR_2IN_1OUT(uchar4, <<) +DECLOP_4VAR_2IN_1OUT(uchar4, >>) + +DECLOP_4VAR_ASSIGN(uchar4, +=) +DECLOP_4VAR_ASSIGN(uchar4, -=) +DECLOP_4VAR_ASSIGN(uchar4, *=) +DECLOP_4VAR_ASSIGN(uchar4, /=) +DECLOP_4VAR_ASSIGN(uchar4, %=) +DECLOP_4VAR_ASSIGN(uchar4, &=) +DECLOP_4VAR_ASSIGN(uchar4, |=) +DECLOP_4VAR_ASSIGN(uchar4, ^=) +DECLOP_4VAR_ASSIGN(uchar4, <<=) +DECLOP_4VAR_ASSIGN(uchar4, >>=) + +DECLOP_4VAR_PREOP(uchar4, ++) +DECLOP_4VAR_PREOP(uchar4, --) + +DECLOP_4VAR_POSTOP(uchar4, ++) +DECLOP_4VAR_POSTOP(uchar4, --) + +DECLOP_4VAR_COMP(uchar4, ==) +DECLOP_4VAR_COMP(uchar4, !=) +DECLOP_4VAR_COMP(uchar4, <) +DECLOP_4VAR_COMP(uchar4, >) +DECLOP_4VAR_COMP(uchar4, <=) +DECLOP_4VAR_COMP(uchar4, >=) + +DECLOP_4VAR_COMP(uchar4, &&) +DECLOP_4VAR_COMP(uchar4, ||) + +DECLOP_4VAR_1IN_1OUT(uchar4, ~) +DECLOP_4VAR_1IN_BOOLOUT(uchar4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, float) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, double) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long) + +// SIGNED CHAR1 + +DECLOP_1VAR_2IN_1OUT(char1, +) +DECLOP_1VAR_2IN_1OUT(char1, -) +DECLOP_1VAR_2IN_1OUT(char1, *) +DECLOP_1VAR_2IN_1OUT(char1, /) +DECLOP_1VAR_2IN_1OUT(char1, %) +DECLOP_1VAR_2IN_1OUT(char1, &) +DECLOP_1VAR_2IN_1OUT(char1, |) +DECLOP_1VAR_2IN_1OUT(char1, ^) +DECLOP_1VAR_2IN_1OUT(char1, <<) +DECLOP_1VAR_2IN_1OUT(char1, >>) + + +DECLOP_1VAR_ASSIGN(char1, +=) +DECLOP_1VAR_ASSIGN(char1, -=) +DECLOP_1VAR_ASSIGN(char1, *=) +DECLOP_1VAR_ASSIGN(char1, /=) +DECLOP_1VAR_ASSIGN(char1, %=) +DECLOP_1VAR_ASSIGN(char1, &=) +DECLOP_1VAR_ASSIGN(char1, |=) +DECLOP_1VAR_ASSIGN(char1, ^=) +DECLOP_1VAR_ASSIGN(char1, <<=) +DECLOP_1VAR_ASSIGN(char1, >>=) + +DECLOP_1VAR_PREOP(char1, ++) +DECLOP_1VAR_PREOP(char1, --) + +DECLOP_1VAR_POSTOP(char1, ++) +DECLOP_1VAR_POSTOP(char1, --) + +DECLOP_1VAR_COMP(char1, ==) +DECLOP_1VAR_COMP(char1, !=) +DECLOP_1VAR_COMP(char1, <) +DECLOP_1VAR_COMP(char1, >) +DECLOP_1VAR_COMP(char1, <=) +DECLOP_1VAR_COMP(char1, >=) + +DECLOP_1VAR_COMP(char1, &&) +DECLOP_1VAR_COMP(char1, ||) + +DECLOP_1VAR_1IN_1OUT(char1, ~) +DECLOP_1VAR_1IN_BOOLOUT(char1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(char1, float) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(char1, double) +DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long) + +// SIGNED CHAR2 + +DECLOP_2VAR_2IN_1OUT(char2, +) +DECLOP_2VAR_2IN_1OUT(char2, -) +DECLOP_2VAR_2IN_1OUT(char2, *) +DECLOP_2VAR_2IN_1OUT(char2, /) +DECLOP_2VAR_2IN_1OUT(char2, %) +DECLOP_2VAR_2IN_1OUT(char2, &) +DECLOP_2VAR_2IN_1OUT(char2, |) +DECLOP_2VAR_2IN_1OUT(char2, ^) +DECLOP_2VAR_2IN_1OUT(char2, <<) +DECLOP_2VAR_2IN_1OUT(char2, >>) + +DECLOP_2VAR_ASSIGN(char2, +=) +DECLOP_2VAR_ASSIGN(char2, -=) +DECLOP_2VAR_ASSIGN(char2, *=) +DECLOP_2VAR_ASSIGN(char2, /=) +DECLOP_2VAR_ASSIGN(char2, %=) +DECLOP_2VAR_ASSIGN(char2, &=) +DECLOP_2VAR_ASSIGN(char2, |=) +DECLOP_2VAR_ASSIGN(char2, ^=) +DECLOP_2VAR_ASSIGN(char2, <<=) +DECLOP_2VAR_ASSIGN(char2, >>=) + +DECLOP_2VAR_PREOP(char2, ++) +DECLOP_2VAR_PREOP(char2, --) + +DECLOP_2VAR_POSTOP(char2, ++) +DECLOP_2VAR_POSTOP(char2, --) + +DECLOP_2VAR_COMP(char2, ==) +DECLOP_2VAR_COMP(char2, !=) +DECLOP_2VAR_COMP(char2, <) +DECLOP_2VAR_COMP(char2, >) +DECLOP_2VAR_COMP(char2, <=) +DECLOP_2VAR_COMP(char2, >=) + +DECLOP_2VAR_COMP(char2, &&) +DECLOP_2VAR_COMP(char2, ||) + +DECLOP_2VAR_1IN_1OUT(char2, ~) +DECLOP_2VAR_1IN_BOOLOUT(char2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(char2, float) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(char2, double) +DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long) + +// SIGNED CHAR3 + +DECLOP_3VAR_2IN_1OUT(char3, +) +DECLOP_3VAR_2IN_1OUT(char3, -) +DECLOP_3VAR_2IN_1OUT(char3, *) +DECLOP_3VAR_2IN_1OUT(char3, /) +DECLOP_3VAR_2IN_1OUT(char3, %) +DECLOP_3VAR_2IN_1OUT(char3, &) +DECLOP_3VAR_2IN_1OUT(char3, |) +DECLOP_3VAR_2IN_1OUT(char3, ^) +DECLOP_3VAR_2IN_1OUT(char3, <<) +DECLOP_3VAR_2IN_1OUT(char3, >>) + +DECLOP_3VAR_ASSIGN(char3, +=) +DECLOP_3VAR_ASSIGN(char3, -=) +DECLOP_3VAR_ASSIGN(char3, *=) +DECLOP_3VAR_ASSIGN(char3, /=) +DECLOP_3VAR_ASSIGN(char3, %=) +DECLOP_3VAR_ASSIGN(char3, &=) +DECLOP_3VAR_ASSIGN(char3, |=) +DECLOP_3VAR_ASSIGN(char3, ^=) +DECLOP_3VAR_ASSIGN(char3, <<=) +DECLOP_3VAR_ASSIGN(char3, >>=) + +DECLOP_3VAR_PREOP(char3, ++) +DECLOP_3VAR_PREOP(char3, --) + +DECLOP_3VAR_POSTOP(char3, ++) +DECLOP_3VAR_POSTOP(char3, --) + +DECLOP_3VAR_COMP(char3, ==) +DECLOP_3VAR_COMP(char3, !=) +DECLOP_3VAR_COMP(char3, <) +DECLOP_3VAR_COMP(char3, >) +DECLOP_3VAR_COMP(char3, <=) +DECLOP_3VAR_COMP(char3, >=) + +DECLOP_3VAR_COMP(char3, &&) +DECLOP_3VAR_COMP(char3, ||) + +DECLOP_3VAR_1IN_1OUT(char3, ~) +DECLOP_3VAR_1IN_BOOLOUT(char3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(char3, float) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(char3, double) +DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long) + +// SIGNED CHAR4 + +DECLOP_4VAR_2IN_1OUT(char4, +) +DECLOP_4VAR_2IN_1OUT(char4, -) +DECLOP_4VAR_2IN_1OUT(char4, *) +DECLOP_4VAR_2IN_1OUT(char4, /) +DECLOP_4VAR_2IN_1OUT(char4, %) +DECLOP_4VAR_2IN_1OUT(char4, &) +DECLOP_4VAR_2IN_1OUT(char4, |) +DECLOP_4VAR_2IN_1OUT(char4, ^) +DECLOP_4VAR_2IN_1OUT(char4, <<) +DECLOP_4VAR_2IN_1OUT(char4, >>) + +DECLOP_4VAR_ASSIGN(char4, +=) +DECLOP_4VAR_ASSIGN(char4, -=) +DECLOP_4VAR_ASSIGN(char4, *=) +DECLOP_4VAR_ASSIGN(char4, /=) +DECLOP_4VAR_ASSIGN(char4, %=) +DECLOP_4VAR_ASSIGN(char4, &=) +DECLOP_4VAR_ASSIGN(char4, |=) +DECLOP_4VAR_ASSIGN(char4, ^=) +DECLOP_4VAR_ASSIGN(char4, <<=) +DECLOP_4VAR_ASSIGN(char4, >>=) + +DECLOP_4VAR_PREOP(char4, ++) +DECLOP_4VAR_PREOP(char4, --) + +DECLOP_4VAR_POSTOP(char4, ++) +DECLOP_4VAR_POSTOP(char4, --) + +DECLOP_4VAR_COMP(char4, ==) +DECLOP_4VAR_COMP(char4, !=) +DECLOP_4VAR_COMP(char4, <) +DECLOP_4VAR_COMP(char4, >) +DECLOP_4VAR_COMP(char4, <=) +DECLOP_4VAR_COMP(char4, >=) + +DECLOP_4VAR_COMP(char4, &&) +DECLOP_4VAR_COMP(char4, ||) + +DECLOP_4VAR_1IN_1OUT(char4, ~) +DECLOP_4VAR_1IN_BOOLOUT(char4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(char4, float) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(char4, double) +DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long) + +// UNSIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(ushort1, +) +DECLOP_1VAR_2IN_1OUT(ushort1, -) +DECLOP_1VAR_2IN_1OUT(ushort1, *) +DECLOP_1VAR_2IN_1OUT(ushort1, /) +DECLOP_1VAR_2IN_1OUT(ushort1, %) +DECLOP_1VAR_2IN_1OUT(ushort1, &) +DECLOP_1VAR_2IN_1OUT(ushort1, |) +DECLOP_1VAR_2IN_1OUT(ushort1, ^) +DECLOP_1VAR_2IN_1OUT(ushort1, <<) +DECLOP_1VAR_2IN_1OUT(ushort1, >>) + + +DECLOP_1VAR_ASSIGN(ushort1, +=) +DECLOP_1VAR_ASSIGN(ushort1, -=) +DECLOP_1VAR_ASSIGN(ushort1, *=) +DECLOP_1VAR_ASSIGN(ushort1, /=) +DECLOP_1VAR_ASSIGN(ushort1, %=) +DECLOP_1VAR_ASSIGN(ushort1, &=) +DECLOP_1VAR_ASSIGN(ushort1, |=) +DECLOP_1VAR_ASSIGN(ushort1, ^=) +DECLOP_1VAR_ASSIGN(ushort1, <<=) +DECLOP_1VAR_ASSIGN(ushort1, >>=) + +DECLOP_1VAR_PREOP(ushort1, ++) +DECLOP_1VAR_PREOP(ushort1, --) + +DECLOP_1VAR_POSTOP(ushort1, ++) +DECLOP_1VAR_POSTOP(ushort1, --) + +DECLOP_1VAR_COMP(ushort1, ==) +DECLOP_1VAR_COMP(ushort1, !=) +DECLOP_1VAR_COMP(ushort1, <) +DECLOP_1VAR_COMP(ushort1, >) +DECLOP_1VAR_COMP(ushort1, <=) +DECLOP_1VAR_COMP(ushort1, >=) + +DECLOP_1VAR_COMP(ushort1, &&) +DECLOP_1VAR_COMP(ushort1, ||) + +DECLOP_1VAR_1IN_1OUT(ushort1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ushort1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, float) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, double) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long) + +// UNSIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(ushort2, +) +DECLOP_2VAR_2IN_1OUT(ushort2, -) +DECLOP_2VAR_2IN_1OUT(ushort2, *) +DECLOP_2VAR_2IN_1OUT(ushort2, /) +DECLOP_2VAR_2IN_1OUT(ushort2, %) +DECLOP_2VAR_2IN_1OUT(ushort2, &) +DECLOP_2VAR_2IN_1OUT(ushort2, |) +DECLOP_2VAR_2IN_1OUT(ushort2, ^) +DECLOP_2VAR_2IN_1OUT(ushort2, <<) +DECLOP_2VAR_2IN_1OUT(ushort2, >>) + +DECLOP_2VAR_ASSIGN(ushort2, +=) +DECLOP_2VAR_ASSIGN(ushort2, -=) +DECLOP_2VAR_ASSIGN(ushort2, *=) +DECLOP_2VAR_ASSIGN(ushort2, /=) +DECLOP_2VAR_ASSIGN(ushort2, %=) +DECLOP_2VAR_ASSIGN(ushort2, &=) +DECLOP_2VAR_ASSIGN(ushort2, |=) +DECLOP_2VAR_ASSIGN(ushort2, ^=) +DECLOP_2VAR_ASSIGN(ushort2, <<=) +DECLOP_2VAR_ASSIGN(ushort2, >>=) + +DECLOP_2VAR_PREOP(ushort2, ++) +DECLOP_2VAR_PREOP(ushort2, --) + +DECLOP_2VAR_POSTOP(ushort2, ++) +DECLOP_2VAR_POSTOP(ushort2, --) + +DECLOP_2VAR_COMP(ushort2, ==) +DECLOP_2VAR_COMP(ushort2, !=) +DECLOP_2VAR_COMP(ushort2, <) +DECLOP_2VAR_COMP(ushort2, >) +DECLOP_2VAR_COMP(ushort2, <=) +DECLOP_2VAR_COMP(ushort2, >=) + +DECLOP_2VAR_COMP(ushort2, &&) +DECLOP_2VAR_COMP(ushort2, ||) + +DECLOP_2VAR_1IN_1OUT(ushort2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ushort2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, float) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, double) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long) + +// UNSIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(ushort3, +) +DECLOP_3VAR_2IN_1OUT(ushort3, -) +DECLOP_3VAR_2IN_1OUT(ushort3, *) +DECLOP_3VAR_2IN_1OUT(ushort3, /) +DECLOP_3VAR_2IN_1OUT(ushort3, %) +DECLOP_3VAR_2IN_1OUT(ushort3, &) +DECLOP_3VAR_2IN_1OUT(ushort3, |) +DECLOP_3VAR_2IN_1OUT(ushort3, ^) +DECLOP_3VAR_2IN_1OUT(ushort3, <<) +DECLOP_3VAR_2IN_1OUT(ushort3, >>) + +DECLOP_3VAR_ASSIGN(ushort3, +=) +DECLOP_3VAR_ASSIGN(ushort3, -=) +DECLOP_3VAR_ASSIGN(ushort3, *=) +DECLOP_3VAR_ASSIGN(ushort3, /=) +DECLOP_3VAR_ASSIGN(ushort3, %=) +DECLOP_3VAR_ASSIGN(ushort3, &=) +DECLOP_3VAR_ASSIGN(ushort3, |=) +DECLOP_3VAR_ASSIGN(ushort3, ^=) +DECLOP_3VAR_ASSIGN(ushort3, <<=) +DECLOP_3VAR_ASSIGN(ushort3, >>=) + +DECLOP_3VAR_PREOP(ushort3, ++) +DECLOP_3VAR_PREOP(ushort3, --) + +DECLOP_3VAR_POSTOP(ushort3, ++) +DECLOP_3VAR_POSTOP(ushort3, --) + +DECLOP_3VAR_COMP(ushort3, ==) +DECLOP_3VAR_COMP(ushort3, !=) +DECLOP_3VAR_COMP(ushort3, <) +DECLOP_3VAR_COMP(ushort3, >) +DECLOP_3VAR_COMP(ushort3, <=) +DECLOP_3VAR_COMP(ushort3, >=) + +DECLOP_3VAR_COMP(ushort3, &&) +DECLOP_3VAR_COMP(ushort3, ||) + +DECLOP_3VAR_1IN_1OUT(ushort3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ushort3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, float) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, double) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long) + +// UNSIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(ushort4, +) +DECLOP_4VAR_2IN_1OUT(ushort4, -) +DECLOP_4VAR_2IN_1OUT(ushort4, *) +DECLOP_4VAR_2IN_1OUT(ushort4, /) +DECLOP_4VAR_2IN_1OUT(ushort4, %) +DECLOP_4VAR_2IN_1OUT(ushort4, &) +DECLOP_4VAR_2IN_1OUT(ushort4, |) +DECLOP_4VAR_2IN_1OUT(ushort4, ^) +DECLOP_4VAR_2IN_1OUT(ushort4, <<) +DECLOP_4VAR_2IN_1OUT(ushort4, >>) + +DECLOP_4VAR_ASSIGN(ushort4, +=) +DECLOP_4VAR_ASSIGN(ushort4, -=) +DECLOP_4VAR_ASSIGN(ushort4, *=) +DECLOP_4VAR_ASSIGN(ushort4, /=) +DECLOP_4VAR_ASSIGN(ushort4, %=) +DECLOP_4VAR_ASSIGN(ushort4, &=) +DECLOP_4VAR_ASSIGN(ushort4, |=) +DECLOP_4VAR_ASSIGN(ushort4, ^=) +DECLOP_4VAR_ASSIGN(ushort4, <<=) +DECLOP_4VAR_ASSIGN(ushort4, >>=) + +DECLOP_4VAR_PREOP(ushort4, ++) +DECLOP_4VAR_PREOP(ushort4, --) + +DECLOP_4VAR_POSTOP(ushort4, ++) +DECLOP_4VAR_POSTOP(ushort4, --) + +DECLOP_4VAR_COMP(ushort4, ==) +DECLOP_4VAR_COMP(ushort4, !=) +DECLOP_4VAR_COMP(ushort4, <) +DECLOP_4VAR_COMP(ushort4, >) +DECLOP_4VAR_COMP(ushort4, <=) +DECLOP_4VAR_COMP(ushort4, >=) + +DECLOP_4VAR_COMP(ushort4, &&) +DECLOP_4VAR_COMP(ushort4, ||) + +DECLOP_4VAR_1IN_1OUT(ushort4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ushort4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, float) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, double) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long) + +// SIGNED SHORT1 + +DECLOP_1VAR_2IN_1OUT(short1, +) +DECLOP_1VAR_2IN_1OUT(short1, -) +DECLOP_1VAR_2IN_1OUT(short1, *) +DECLOP_1VAR_2IN_1OUT(short1, /) +DECLOP_1VAR_2IN_1OUT(short1, %) +DECLOP_1VAR_2IN_1OUT(short1, &) +DECLOP_1VAR_2IN_1OUT(short1, |) +DECLOP_1VAR_2IN_1OUT(short1, ^) +DECLOP_1VAR_2IN_1OUT(short1, <<) +DECLOP_1VAR_2IN_1OUT(short1, >>) + + +DECLOP_1VAR_ASSIGN(short1, +=) +DECLOP_1VAR_ASSIGN(short1, -=) +DECLOP_1VAR_ASSIGN(short1, *=) +DECLOP_1VAR_ASSIGN(short1, /=) +DECLOP_1VAR_ASSIGN(short1, %=) +DECLOP_1VAR_ASSIGN(short1, &=) +DECLOP_1VAR_ASSIGN(short1, |=) +DECLOP_1VAR_ASSIGN(short1, ^=) +DECLOP_1VAR_ASSIGN(short1, <<=) +DECLOP_1VAR_ASSIGN(short1, >>=) + +DECLOP_1VAR_PREOP(short1, ++) +DECLOP_1VAR_PREOP(short1, --) + +DECLOP_1VAR_POSTOP(short1, ++) +DECLOP_1VAR_POSTOP(short1, --) + +DECLOP_1VAR_COMP(short1, ==) +DECLOP_1VAR_COMP(short1, !=) +DECLOP_1VAR_COMP(short1, <) +DECLOP_1VAR_COMP(short1, >) +DECLOP_1VAR_COMP(short1, <=) +DECLOP_1VAR_COMP(short1, >=) + +DECLOP_1VAR_COMP(short1, &&) +DECLOP_1VAR_COMP(short1, ||) + +DECLOP_1VAR_1IN_1OUT(short1, ~) +DECLOP_1VAR_1IN_BOOLOUT(short1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(short1, float) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(short1, double) +DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long) + +// SIGNED SHORT2 + +DECLOP_2VAR_2IN_1OUT(short2, +) +DECLOP_2VAR_2IN_1OUT(short2, -) +DECLOP_2VAR_2IN_1OUT(short2, *) +DECLOP_2VAR_2IN_1OUT(short2, /) +DECLOP_2VAR_2IN_1OUT(short2, %) +DECLOP_2VAR_2IN_1OUT(short2, &) +DECLOP_2VAR_2IN_1OUT(short2, |) +DECLOP_2VAR_2IN_1OUT(short2, ^) +DECLOP_2VAR_2IN_1OUT(short2, <<) +DECLOP_2VAR_2IN_1OUT(short2, >>) + +DECLOP_2VAR_ASSIGN(short2, +=) +DECLOP_2VAR_ASSIGN(short2, -=) +DECLOP_2VAR_ASSIGN(short2, *=) +DECLOP_2VAR_ASSIGN(short2, /=) +DECLOP_2VAR_ASSIGN(short2, %=) +DECLOP_2VAR_ASSIGN(short2, &=) +DECLOP_2VAR_ASSIGN(short2, |=) +DECLOP_2VAR_ASSIGN(short2, ^=) +DECLOP_2VAR_ASSIGN(short2, <<=) +DECLOP_2VAR_ASSIGN(short2, >>=) + +DECLOP_2VAR_PREOP(short2, ++) +DECLOP_2VAR_PREOP(short2, --) + +DECLOP_2VAR_POSTOP(short2, ++) +DECLOP_2VAR_POSTOP(short2, --) + +DECLOP_2VAR_COMP(short2, ==) +DECLOP_2VAR_COMP(short2, !=) +DECLOP_2VAR_COMP(short2, <) +DECLOP_2VAR_COMP(short2, >) +DECLOP_2VAR_COMP(short2, <=) +DECLOP_2VAR_COMP(short2, >=) + +DECLOP_2VAR_COMP(short2, &&) +DECLOP_2VAR_COMP(short2, ||) + +DECLOP_2VAR_1IN_1OUT(short2, ~) +DECLOP_2VAR_1IN_BOOLOUT(short2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(short2, float) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(short2, double) +DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long) + +// SIGNED SHORT3 + +DECLOP_3VAR_2IN_1OUT(short3, +) +DECLOP_3VAR_2IN_1OUT(short3, -) +DECLOP_3VAR_2IN_1OUT(short3, *) +DECLOP_3VAR_2IN_1OUT(short3, /) +DECLOP_3VAR_2IN_1OUT(short3, %) +DECLOP_3VAR_2IN_1OUT(short3, &) +DECLOP_3VAR_2IN_1OUT(short3, |) +DECLOP_3VAR_2IN_1OUT(short3, ^) +DECLOP_3VAR_2IN_1OUT(short3, <<) +DECLOP_3VAR_2IN_1OUT(short3, >>) + +DECLOP_3VAR_ASSIGN(short3, +=) +DECLOP_3VAR_ASSIGN(short3, -=) +DECLOP_3VAR_ASSIGN(short3, *=) +DECLOP_3VAR_ASSIGN(short3, /=) +DECLOP_3VAR_ASSIGN(short3, %=) +DECLOP_3VAR_ASSIGN(short3, &=) +DECLOP_3VAR_ASSIGN(short3, |=) +DECLOP_3VAR_ASSIGN(short3, ^=) +DECLOP_3VAR_ASSIGN(short3, <<=) +DECLOP_3VAR_ASSIGN(short3, >>=) + +DECLOP_3VAR_PREOP(short3, ++) +DECLOP_3VAR_PREOP(short3, --) + +DECLOP_3VAR_POSTOP(short3, ++) +DECLOP_3VAR_POSTOP(short3, --) + +DECLOP_3VAR_COMP(short3, ==) +DECLOP_3VAR_COMP(short3, !=) +DECLOP_3VAR_COMP(short3, <) +DECLOP_3VAR_COMP(short3, >) +DECLOP_3VAR_COMP(short3, <=) +DECLOP_3VAR_COMP(short3, >=) + +DECLOP_3VAR_COMP(short3, &&) +DECLOP_3VAR_COMP(short3, ||) + +DECLOP_3VAR_1IN_1OUT(short3, ~) +DECLOP_3VAR_1IN_BOOLOUT(short3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(short3, float) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(short3, double) +DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long) + +// SIGNED SHORT4 + +DECLOP_4VAR_2IN_1OUT(short4, +) +DECLOP_4VAR_2IN_1OUT(short4, -) +DECLOP_4VAR_2IN_1OUT(short4, *) +DECLOP_4VAR_2IN_1OUT(short4, /) +DECLOP_4VAR_2IN_1OUT(short4, %) +DECLOP_4VAR_2IN_1OUT(short4, &) +DECLOP_4VAR_2IN_1OUT(short4, |) +DECLOP_4VAR_2IN_1OUT(short4, ^) +DECLOP_4VAR_2IN_1OUT(short4, <<) +DECLOP_4VAR_2IN_1OUT(short4, >>) + +DECLOP_4VAR_ASSIGN(short4, +=) +DECLOP_4VAR_ASSIGN(short4, -=) +DECLOP_4VAR_ASSIGN(short4, *=) +DECLOP_4VAR_ASSIGN(short4, /=) +DECLOP_4VAR_ASSIGN(short4, %=) +DECLOP_4VAR_ASSIGN(short4, &=) +DECLOP_4VAR_ASSIGN(short4, |=) +DECLOP_4VAR_ASSIGN(short4, ^=) +DECLOP_4VAR_ASSIGN(short4, <<=) +DECLOP_4VAR_ASSIGN(short4, >>=) + +DECLOP_4VAR_PREOP(short4, ++) +DECLOP_4VAR_PREOP(short4, --) + +DECLOP_4VAR_POSTOP(short4, ++) +DECLOP_4VAR_POSTOP(short4, --) + +DECLOP_4VAR_COMP(short4, ==) +DECLOP_4VAR_COMP(short4, !=) +DECLOP_4VAR_COMP(short4, <) +DECLOP_4VAR_COMP(short4, >) +DECLOP_4VAR_COMP(short4, <=) +DECLOP_4VAR_COMP(short4, >=) + +DECLOP_4VAR_COMP(short4, &&) +DECLOP_4VAR_COMP(short4, ||) + +DECLOP_4VAR_1IN_1OUT(short4, ~) +DECLOP_4VAR_1IN_BOOLOUT(short4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(short4, float) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(short4, double) +DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long) + +// UNSIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(uint1, +) +DECLOP_1VAR_2IN_1OUT(uint1, -) +DECLOP_1VAR_2IN_1OUT(uint1, *) +DECLOP_1VAR_2IN_1OUT(uint1, /) +DECLOP_1VAR_2IN_1OUT(uint1, %) +DECLOP_1VAR_2IN_1OUT(uint1, &) +DECLOP_1VAR_2IN_1OUT(uint1, |) +DECLOP_1VAR_2IN_1OUT(uint1, ^) +DECLOP_1VAR_2IN_1OUT(uint1, <<) +DECLOP_1VAR_2IN_1OUT(uint1, >>) + + +DECLOP_1VAR_ASSIGN(uint1, +=) +DECLOP_1VAR_ASSIGN(uint1, -=) +DECLOP_1VAR_ASSIGN(uint1, *=) +DECLOP_1VAR_ASSIGN(uint1, /=) +DECLOP_1VAR_ASSIGN(uint1, %=) +DECLOP_1VAR_ASSIGN(uint1, &=) +DECLOP_1VAR_ASSIGN(uint1, |=) +DECLOP_1VAR_ASSIGN(uint1, ^=) +DECLOP_1VAR_ASSIGN(uint1, <<=) +DECLOP_1VAR_ASSIGN(uint1, >>=) + +DECLOP_1VAR_PREOP(uint1, ++) +DECLOP_1VAR_PREOP(uint1, --) + +DECLOP_1VAR_POSTOP(uint1, ++) +DECLOP_1VAR_POSTOP(uint1, --) + +DECLOP_1VAR_COMP(uint1, ==) +DECLOP_1VAR_COMP(uint1, !=) +DECLOP_1VAR_COMP(uint1, <) +DECLOP_1VAR_COMP(uint1, >) +DECLOP_1VAR_COMP(uint1, <=) +DECLOP_1VAR_COMP(uint1, >=) + +DECLOP_1VAR_COMP(uint1, &&) +DECLOP_1VAR_COMP(uint1, ||) + +DECLOP_1VAR_1IN_1OUT(uint1, ~) +DECLOP_1VAR_1IN_BOOLOUT(uint1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(uint1, float) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, double) +DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long) + +// UNSIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(uint2, +) +DECLOP_2VAR_2IN_1OUT(uint2, -) +DECLOP_2VAR_2IN_1OUT(uint2, *) +DECLOP_2VAR_2IN_1OUT(uint2, /) +DECLOP_2VAR_2IN_1OUT(uint2, %) +DECLOP_2VAR_2IN_1OUT(uint2, &) +DECLOP_2VAR_2IN_1OUT(uint2, |) +DECLOP_2VAR_2IN_1OUT(uint2, ^) +DECLOP_2VAR_2IN_1OUT(uint2, <<) +DECLOP_2VAR_2IN_1OUT(uint2, >>) + +DECLOP_2VAR_ASSIGN(uint2, +=) +DECLOP_2VAR_ASSIGN(uint2, -=) +DECLOP_2VAR_ASSIGN(uint2, *=) +DECLOP_2VAR_ASSIGN(uint2, /=) +DECLOP_2VAR_ASSIGN(uint2, %=) +DECLOP_2VAR_ASSIGN(uint2, &=) +DECLOP_2VAR_ASSIGN(uint2, |=) +DECLOP_2VAR_ASSIGN(uint2, ^=) +DECLOP_2VAR_ASSIGN(uint2, <<=) +DECLOP_2VAR_ASSIGN(uint2, >>=) + +DECLOP_2VAR_PREOP(uint2, ++) +DECLOP_2VAR_PREOP(uint2, --) + +DECLOP_2VAR_POSTOP(uint2, ++) +DECLOP_2VAR_POSTOP(uint2, --) + +DECLOP_2VAR_COMP(uint2, ==) +DECLOP_2VAR_COMP(uint2, !=) +DECLOP_2VAR_COMP(uint2, <) +DECLOP_2VAR_COMP(uint2, >) +DECLOP_2VAR_COMP(uint2, <=) +DECLOP_2VAR_COMP(uint2, >=) + +DECLOP_2VAR_COMP(uint2, &&) +DECLOP_2VAR_COMP(uint2, ||) + +DECLOP_2VAR_1IN_1OUT(uint2, ~) +DECLOP_2VAR_1IN_BOOLOUT(uint2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(uint2, float) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, double) +DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long) + +// UNSIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(uint3, +) +DECLOP_3VAR_2IN_1OUT(uint3, -) +DECLOP_3VAR_2IN_1OUT(uint3, *) +DECLOP_3VAR_2IN_1OUT(uint3, /) +DECLOP_3VAR_2IN_1OUT(uint3, %) +DECLOP_3VAR_2IN_1OUT(uint3, &) +DECLOP_3VAR_2IN_1OUT(uint3, |) +DECLOP_3VAR_2IN_1OUT(uint3, ^) +DECLOP_3VAR_2IN_1OUT(uint3, <<) +DECLOP_3VAR_2IN_1OUT(uint3, >>) + +DECLOP_3VAR_ASSIGN(uint3, +=) +DECLOP_3VAR_ASSIGN(uint3, -=) +DECLOP_3VAR_ASSIGN(uint3, *=) +DECLOP_3VAR_ASSIGN(uint3, /=) +DECLOP_3VAR_ASSIGN(uint3, %=) +DECLOP_3VAR_ASSIGN(uint3, &=) +DECLOP_3VAR_ASSIGN(uint3, |=) +DECLOP_3VAR_ASSIGN(uint3, ^=) +DECLOP_3VAR_ASSIGN(uint3, <<=) +DECLOP_3VAR_ASSIGN(uint3, >>=) + +DECLOP_3VAR_PREOP(uint3, ++) +DECLOP_3VAR_PREOP(uint3, --) + +DECLOP_3VAR_POSTOP(uint3, ++) +DECLOP_3VAR_POSTOP(uint3, --) + +DECLOP_3VAR_COMP(uint3, ==) +DECLOP_3VAR_COMP(uint3, !=) +DECLOP_3VAR_COMP(uint3, <) +DECLOP_3VAR_COMP(uint3, >) +DECLOP_3VAR_COMP(uint3, <=) +DECLOP_3VAR_COMP(uint3, >=) + +DECLOP_3VAR_COMP(uint3, &&) +DECLOP_3VAR_COMP(uint3, ||) + +DECLOP_3VAR_1IN_1OUT(uint3, ~) +DECLOP_3VAR_1IN_BOOLOUT(uint3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(uint3, float) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, double) +DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long) + +// UNSIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(uint4, +) +DECLOP_4VAR_2IN_1OUT(uint4, -) +DECLOP_4VAR_2IN_1OUT(uint4, *) +DECLOP_4VAR_2IN_1OUT(uint4, /) +DECLOP_4VAR_2IN_1OUT(uint4, %) +DECLOP_4VAR_2IN_1OUT(uint4, &) +DECLOP_4VAR_2IN_1OUT(uint4, |) +DECLOP_4VAR_2IN_1OUT(uint4, ^) +DECLOP_4VAR_2IN_1OUT(uint4, <<) +DECLOP_4VAR_2IN_1OUT(uint4, >>) + +DECLOP_4VAR_ASSIGN(uint4, +=) +DECLOP_4VAR_ASSIGN(uint4, -=) +DECLOP_4VAR_ASSIGN(uint4, *=) +DECLOP_4VAR_ASSIGN(uint4, /=) +DECLOP_4VAR_ASSIGN(uint4, %=) +DECLOP_4VAR_ASSIGN(uint4, &=) +DECLOP_4VAR_ASSIGN(uint4, |=) +DECLOP_4VAR_ASSIGN(uint4, ^=) +DECLOP_4VAR_ASSIGN(uint4, <<=) +DECLOP_4VAR_ASSIGN(uint4, >>=) + +DECLOP_4VAR_PREOP(uint4, ++) +DECLOP_4VAR_PREOP(uint4, --) + +DECLOP_4VAR_POSTOP(uint4, ++) +DECLOP_4VAR_POSTOP(uint4, --) + +DECLOP_4VAR_COMP(uint4, ==) +DECLOP_4VAR_COMP(uint4, !=) +DECLOP_4VAR_COMP(uint4, <) +DECLOP_4VAR_COMP(uint4, >) +DECLOP_4VAR_COMP(uint4, <=) +DECLOP_4VAR_COMP(uint4, >=) + +DECLOP_4VAR_COMP(uint4, &&) +DECLOP_4VAR_COMP(uint4, ||) + +DECLOP_4VAR_1IN_1OUT(uint4, ~) +DECLOP_4VAR_1IN_BOOLOUT(uint4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(uint4, float) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, double) +DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long) + +// SIGNED INT1 + +DECLOP_1VAR_2IN_1OUT(int1, +) +DECLOP_1VAR_2IN_1OUT(int1, -) +DECLOP_1VAR_2IN_1OUT(int1, *) +DECLOP_1VAR_2IN_1OUT(int1, /) +DECLOP_1VAR_2IN_1OUT(int1, %) +DECLOP_1VAR_2IN_1OUT(int1, &) +DECLOP_1VAR_2IN_1OUT(int1, |) +DECLOP_1VAR_2IN_1OUT(int1, ^) +DECLOP_1VAR_2IN_1OUT(int1, <<) +DECLOP_1VAR_2IN_1OUT(int1, >>) + + +DECLOP_1VAR_ASSIGN(int1, +=) +DECLOP_1VAR_ASSIGN(int1, -=) +DECLOP_1VAR_ASSIGN(int1, *=) +DECLOP_1VAR_ASSIGN(int1, /=) +DECLOP_1VAR_ASSIGN(int1, %=) +DECLOP_1VAR_ASSIGN(int1, &=) +DECLOP_1VAR_ASSIGN(int1, |=) +DECLOP_1VAR_ASSIGN(int1, ^=) +DECLOP_1VAR_ASSIGN(int1, <<=) +DECLOP_1VAR_ASSIGN(int1, >>=) + +DECLOP_1VAR_PREOP(int1, ++) +DECLOP_1VAR_PREOP(int1, --) + +DECLOP_1VAR_POSTOP(int1, ++) +DECLOP_1VAR_POSTOP(int1, --) + +DECLOP_1VAR_COMP(int1, ==) +DECLOP_1VAR_COMP(int1, !=) +DECLOP_1VAR_COMP(int1, <) +DECLOP_1VAR_COMP(int1, >) +DECLOP_1VAR_COMP(int1, <=) +DECLOP_1VAR_COMP(int1, >=) + +DECLOP_1VAR_COMP(int1, &&) +DECLOP_1VAR_COMP(int1, ||) + +DECLOP_1VAR_1IN_1OUT(int1, ~) +DECLOP_1VAR_1IN_BOOLOUT(int1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(int1, float) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(int1, double) +DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long) + +// SIGNED INT2 + +DECLOP_2VAR_2IN_1OUT(int2, +) +DECLOP_2VAR_2IN_1OUT(int2, -) +DECLOP_2VAR_2IN_1OUT(int2, *) +DECLOP_2VAR_2IN_1OUT(int2, /) +DECLOP_2VAR_2IN_1OUT(int2, %) +DECLOP_2VAR_2IN_1OUT(int2, &) +DECLOP_2VAR_2IN_1OUT(int2, |) +DECLOP_2VAR_2IN_1OUT(int2, ^) +DECLOP_2VAR_2IN_1OUT(int2, <<) +DECLOP_2VAR_2IN_1OUT(int2, >>) + +DECLOP_2VAR_ASSIGN(int2, +=) +DECLOP_2VAR_ASSIGN(int2, -=) +DECLOP_2VAR_ASSIGN(int2, *=) +DECLOP_2VAR_ASSIGN(int2, /=) +DECLOP_2VAR_ASSIGN(int2, %=) +DECLOP_2VAR_ASSIGN(int2, &=) +DECLOP_2VAR_ASSIGN(int2, |=) +DECLOP_2VAR_ASSIGN(int2, ^=) +DECLOP_2VAR_ASSIGN(int2, <<=) +DECLOP_2VAR_ASSIGN(int2, >>=) + +DECLOP_2VAR_PREOP(int2, ++) +DECLOP_2VAR_PREOP(int2, --) + +DECLOP_2VAR_POSTOP(int2, ++) +DECLOP_2VAR_POSTOP(int2, --) + +DECLOP_2VAR_COMP(int2, ==) +DECLOP_2VAR_COMP(int2, !=) +DECLOP_2VAR_COMP(int2, <) +DECLOP_2VAR_COMP(int2, >) +DECLOP_2VAR_COMP(int2, <=) +DECLOP_2VAR_COMP(int2, >=) + +DECLOP_2VAR_COMP(int2, &&) +DECLOP_2VAR_COMP(int2, ||) + +DECLOP_2VAR_1IN_1OUT(int2, ~) +DECLOP_2VAR_1IN_BOOLOUT(int2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(int2, float) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(int2, double) +DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long) + +// SIGNED INT3 + +DECLOP_3VAR_2IN_1OUT(int3, +) +DECLOP_3VAR_2IN_1OUT(int3, -) +DECLOP_3VAR_2IN_1OUT(int3, *) +DECLOP_3VAR_2IN_1OUT(int3, /) +DECLOP_3VAR_2IN_1OUT(int3, %) +DECLOP_3VAR_2IN_1OUT(int3, &) +DECLOP_3VAR_2IN_1OUT(int3, |) +DECLOP_3VAR_2IN_1OUT(int3, ^) +DECLOP_3VAR_2IN_1OUT(int3, <<) +DECLOP_3VAR_2IN_1OUT(int3, >>) + +DECLOP_3VAR_ASSIGN(int3, +=) +DECLOP_3VAR_ASSIGN(int3, -=) +DECLOP_3VAR_ASSIGN(int3, *=) +DECLOP_3VAR_ASSIGN(int3, /=) +DECLOP_3VAR_ASSIGN(int3, %=) +DECLOP_3VAR_ASSIGN(int3, &=) +DECLOP_3VAR_ASSIGN(int3, |=) +DECLOP_3VAR_ASSIGN(int3, ^=) +DECLOP_3VAR_ASSIGN(int3, <<=) +DECLOP_3VAR_ASSIGN(int3, >>=) + +DECLOP_3VAR_PREOP(int3, ++) +DECLOP_3VAR_PREOP(int3, --) + +DECLOP_3VAR_POSTOP(int3, ++) +DECLOP_3VAR_POSTOP(int3, --) + +DECLOP_3VAR_COMP(int3, ==) +DECLOP_3VAR_COMP(int3, !=) +DECLOP_3VAR_COMP(int3, <) +DECLOP_3VAR_COMP(int3, >) +DECLOP_3VAR_COMP(int3, <=) +DECLOP_3VAR_COMP(int3, >=) + +DECLOP_3VAR_COMP(int3, &&) +DECLOP_3VAR_COMP(int3, ||) + +DECLOP_3VAR_1IN_1OUT(int3, ~) +DECLOP_3VAR_1IN_BOOLOUT(int3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(int3, float) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(int3, double) +DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long) + +// SIGNED INT4 + +DECLOP_4VAR_2IN_1OUT(int4, +) +DECLOP_4VAR_2IN_1OUT(int4, -) +DECLOP_4VAR_2IN_1OUT(int4, *) +DECLOP_4VAR_2IN_1OUT(int4, /) +DECLOP_4VAR_2IN_1OUT(int4, %) +DECLOP_4VAR_2IN_1OUT(int4, &) +DECLOP_4VAR_2IN_1OUT(int4, |) +DECLOP_4VAR_2IN_1OUT(int4, ^) +DECLOP_4VAR_2IN_1OUT(int4, <<) +DECLOP_4VAR_2IN_1OUT(int4, >>) + +DECLOP_4VAR_ASSIGN(int4, +=) +DECLOP_4VAR_ASSIGN(int4, -=) +DECLOP_4VAR_ASSIGN(int4, *=) +DECLOP_4VAR_ASSIGN(int4, /=) +DECLOP_4VAR_ASSIGN(int4, %=) +DECLOP_4VAR_ASSIGN(int4, &=) +DECLOP_4VAR_ASSIGN(int4, |=) +DECLOP_4VAR_ASSIGN(int4, ^=) +DECLOP_4VAR_ASSIGN(int4, <<=) +DECLOP_4VAR_ASSIGN(int4, >>=) + +DECLOP_4VAR_PREOP(int4, ++) +DECLOP_4VAR_PREOP(int4, --) + +DECLOP_4VAR_POSTOP(int4, ++) +DECLOP_4VAR_POSTOP(int4, --) + +DECLOP_4VAR_COMP(int4, ==) +DECLOP_4VAR_COMP(int4, !=) +DECLOP_4VAR_COMP(int4, <) +DECLOP_4VAR_COMP(int4, >) +DECLOP_4VAR_COMP(int4, <=) +DECLOP_4VAR_COMP(int4, >=) + +DECLOP_4VAR_COMP(int4, &&) +DECLOP_4VAR_COMP(int4, ||) + +DECLOP_4VAR_1IN_1OUT(int4, ~) +DECLOP_4VAR_1IN_BOOLOUT(int4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(int4, float) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(int4, double) +DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long) + +// FLOAT1 + +DECLOP_1VAR_2IN_1OUT(float1, +) +DECLOP_1VAR_2IN_1OUT(float1, -) +DECLOP_1VAR_2IN_1OUT(float1, *) +DECLOP_1VAR_2IN_1OUT(float1, /) + +DECLOP_1VAR_ASSIGN(float1, +=) +DECLOP_1VAR_ASSIGN(float1, -=) +DECLOP_1VAR_ASSIGN(float1, *=) +DECLOP_1VAR_ASSIGN(float1, /=) + +DECLOP_1VAR_PREOP(float1, ++) +DECLOP_1VAR_PREOP(float1, --) + +DECLOP_1VAR_POSTOP(float1, ++) +DECLOP_1VAR_POSTOP(float1, --) + +DECLOP_1VAR_COMP(float1, ==) +DECLOP_1VAR_COMP(float1, !=) +DECLOP_1VAR_COMP(float1, <) +DECLOP_1VAR_COMP(float1, >) +DECLOP_1VAR_COMP(float1, <=) +DECLOP_1VAR_COMP(float1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(float1, float) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(float1, double) +DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long) + +// FLOAT2 + +DECLOP_2VAR_2IN_1OUT(float2, +) +DECLOP_2VAR_2IN_1OUT(float2, -) +DECLOP_2VAR_2IN_1OUT(float2, *) +DECLOP_2VAR_2IN_1OUT(float2, /) + +DECLOP_2VAR_ASSIGN(float2, +=) +DECLOP_2VAR_ASSIGN(float2, -=) +DECLOP_2VAR_ASSIGN(float2, *=) 
+DECLOP_2VAR_ASSIGN(float2, /=) + +DECLOP_2VAR_PREOP(float2, ++) +DECLOP_2VAR_PREOP(float2, --) + +DECLOP_2VAR_POSTOP(float2, ++) +DECLOP_2VAR_POSTOP(float2, --) + +DECLOP_2VAR_COMP(float2, ==) +DECLOP_2VAR_COMP(float2, !=) +DECLOP_2VAR_COMP(float2, <) +DECLOP_2VAR_COMP(float2, >) +DECLOP_2VAR_COMP(float2, <=) +DECLOP_2VAR_COMP(float2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(float2, float) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(float2, double) +DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long) + +// FLOAT3 + +DECLOP_3VAR_2IN_1OUT(float3, +) +DECLOP_3VAR_2IN_1OUT(float3, -) +DECLOP_3VAR_2IN_1OUT(float3, *) +DECLOP_3VAR_2IN_1OUT(float3, /) + +DECLOP_3VAR_ASSIGN(float3, +=) +DECLOP_3VAR_ASSIGN(float3, -=) +DECLOP_3VAR_ASSIGN(float3, *=) +DECLOP_3VAR_ASSIGN(float3, /=) + +DECLOP_3VAR_PREOP(float3, ++) +DECLOP_3VAR_PREOP(float3, --) + +DECLOP_3VAR_POSTOP(float3, ++) +DECLOP_3VAR_POSTOP(float3, --) + +DECLOP_3VAR_COMP(float3, ==) +DECLOP_3VAR_COMP(float3, !=) +DECLOP_3VAR_COMP(float3, <) +DECLOP_3VAR_COMP(float3, >) +DECLOP_3VAR_COMP(float3, <=) +DECLOP_3VAR_COMP(float3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(float3, float) +DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(float3, double) 
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long) + +// FLOAT4 + +DECLOP_4VAR_2IN_1OUT(float4, +) +DECLOP_4VAR_2IN_1OUT(float4, -) +DECLOP_4VAR_2IN_1OUT(float4, *) +DECLOP_4VAR_2IN_1OUT(float4, /) + +DECLOP_4VAR_ASSIGN(float4, +=) +DECLOP_4VAR_ASSIGN(float4, -=) +DECLOP_4VAR_ASSIGN(float4, *=) +DECLOP_4VAR_ASSIGN(float4, /=) + +DECLOP_4VAR_PREOP(float4, ++) +DECLOP_4VAR_PREOP(float4, --) + +DECLOP_4VAR_POSTOP(float4, ++) +DECLOP_4VAR_POSTOP(float4, --) + +DECLOP_4VAR_COMP(float4, ==) +DECLOP_4VAR_COMP(float4, !=) +DECLOP_4VAR_COMP(float4, <) +DECLOP_4VAR_COMP(float4, >) +DECLOP_4VAR_COMP(float4, <=) +DECLOP_4VAR_COMP(float4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(float4, float) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(float4, double) +DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long) + +// DOUBLE1 + +DECLOP_1VAR_2IN_1OUT(double1, +) +DECLOP_1VAR_2IN_1OUT(double1, -) +DECLOP_1VAR_2IN_1OUT(double1, *) +DECLOP_1VAR_2IN_1OUT(double1, /) + +DECLOP_1VAR_ASSIGN(double1, +=) +DECLOP_1VAR_ASSIGN(double1, -=) +DECLOP_1VAR_ASSIGN(double1, *=) +DECLOP_1VAR_ASSIGN(double1, /=) + +DECLOP_1VAR_PREOP(double1, ++) +DECLOP_1VAR_PREOP(double1, --) + +DECLOP_1VAR_POSTOP(double1, ++) +DECLOP_1VAR_POSTOP(double1, --) + +DECLOP_1VAR_COMP(double1, ==) +DECLOP_1VAR_COMP(double1, !=) +DECLOP_1VAR_COMP(double1, <) +DECLOP_1VAR_COMP(double1, >) +DECLOP_1VAR_COMP(double1, <=) +DECLOP_1VAR_COMP(double1, >=) + +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed char) 
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(double1, float) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(double1, double) +DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long) + +// DOUBLE2 + +DECLOP_2VAR_2IN_1OUT(double2, +) +DECLOP_2VAR_2IN_1OUT(double2, -) +DECLOP_2VAR_2IN_1OUT(double2, *) +DECLOP_2VAR_2IN_1OUT(double2, /) + +DECLOP_2VAR_ASSIGN(double2, +=) +DECLOP_2VAR_ASSIGN(double2, -=) +DECLOP_2VAR_ASSIGN(double2, *=) +DECLOP_2VAR_ASSIGN(double2, /=) + +DECLOP_2VAR_PREOP(double2, ++) +DECLOP_2VAR_PREOP(double2, --) + +DECLOP_2VAR_POSTOP(double2, ++) +DECLOP_2VAR_POSTOP(double2, --) + +DECLOP_2VAR_COMP(double2, ==) +DECLOP_2VAR_COMP(double2, !=) +DECLOP_2VAR_COMP(double2, <) +DECLOP_2VAR_COMP(double2, >) +DECLOP_2VAR_COMP(double2, <=) +DECLOP_2VAR_COMP(double2, >=) + +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(double2, float) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(double2, double) +DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long) + +// DOUBLE3 + +DECLOP_3VAR_2IN_1OUT(double3, +) +DECLOP_3VAR_2IN_1OUT(double3, -) +DECLOP_3VAR_2IN_1OUT(double3, *) +DECLOP_3VAR_2IN_1OUT(double3, /) + +DECLOP_3VAR_ASSIGN(double3, +=) +DECLOP_3VAR_ASSIGN(double3, -=) +DECLOP_3VAR_ASSIGN(double3, *=) +DECLOP_3VAR_ASSIGN(double3, /=) + 
+DECLOP_3VAR_PREOP(double3, ++) +DECLOP_3VAR_PREOP(double3, --) + +DECLOP_3VAR_POSTOP(double3, ++) +DECLOP_3VAR_POSTOP(double3, --) + +DECLOP_3VAR_COMP(double3, ==) +DECLOP_3VAR_COMP(double3, !=) +DECLOP_3VAR_COMP(double3, <) +DECLOP_3VAR_COMP(double3, >) +DECLOP_3VAR_COMP(double3, <=) +DECLOP_3VAR_COMP(double3, >=) + +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(double3, float) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(double3, double) +DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long) + +// DOUBLE4 + +DECLOP_4VAR_2IN_1OUT(double4, +) +DECLOP_4VAR_2IN_1OUT(double4, -) +DECLOP_4VAR_2IN_1OUT(double4, *) +DECLOP_4VAR_2IN_1OUT(double4, /) + +DECLOP_4VAR_ASSIGN(double4, +=) +DECLOP_4VAR_ASSIGN(double4, -=) +DECLOP_4VAR_ASSIGN(double4, *=) +DECLOP_4VAR_ASSIGN(double4, /=) + +DECLOP_4VAR_PREOP(double4, ++) +DECLOP_4VAR_PREOP(double4, --) + +DECLOP_4VAR_POSTOP(double4, ++) +DECLOP_4VAR_POSTOP(double4, --) + +DECLOP_4VAR_COMP(double4, ==) +DECLOP_4VAR_COMP(double4, !=) +DECLOP_4VAR_COMP(double4, <) +DECLOP_4VAR_COMP(double4, >) +DECLOP_4VAR_COMP(double4, <=) +DECLOP_4VAR_COMP(double4, >=) + +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(double4, float) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long) 
+DECLOP_4VAR_SCALE_PRODUCT(double4, double) +DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long) + +// UNSIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(ulong1, +) +DECLOP_1VAR_2IN_1OUT(ulong1, -) +DECLOP_1VAR_2IN_1OUT(ulong1, *) +DECLOP_1VAR_2IN_1OUT(ulong1, /) +DECLOP_1VAR_2IN_1OUT(ulong1, %) +DECLOP_1VAR_2IN_1OUT(ulong1, &) +DECLOP_1VAR_2IN_1OUT(ulong1, |) +DECLOP_1VAR_2IN_1OUT(ulong1, ^) +DECLOP_1VAR_2IN_1OUT(ulong1, <<) +DECLOP_1VAR_2IN_1OUT(ulong1, >>) + + +DECLOP_1VAR_ASSIGN(ulong1, +=) +DECLOP_1VAR_ASSIGN(ulong1, -=) +DECLOP_1VAR_ASSIGN(ulong1, *=) +DECLOP_1VAR_ASSIGN(ulong1, /=) +DECLOP_1VAR_ASSIGN(ulong1, %=) +DECLOP_1VAR_ASSIGN(ulong1, &=) +DECLOP_1VAR_ASSIGN(ulong1, |=) +DECLOP_1VAR_ASSIGN(ulong1, ^=) +DECLOP_1VAR_ASSIGN(ulong1, <<=) +DECLOP_1VAR_ASSIGN(ulong1, >>=) + +DECLOP_1VAR_PREOP(ulong1, ++) +DECLOP_1VAR_PREOP(ulong1, --) + +DECLOP_1VAR_POSTOP(ulong1, ++) +DECLOP_1VAR_POSTOP(ulong1, --) + +DECLOP_1VAR_COMP(ulong1, ==) +DECLOP_1VAR_COMP(ulong1, !=) +DECLOP_1VAR_COMP(ulong1, <) +DECLOP_1VAR_COMP(ulong1, >) +DECLOP_1VAR_COMP(ulong1, <=) +DECLOP_1VAR_COMP(ulong1, >=) + +DECLOP_1VAR_COMP(ulong1, &&) +DECLOP_1VAR_COMP(ulong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long) + +// UNSIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(ulong2, +) +DECLOP_2VAR_2IN_1OUT(ulong2, -) +DECLOP_2VAR_2IN_1OUT(ulong2, *) +DECLOP_2VAR_2IN_1OUT(ulong2, /) +DECLOP_2VAR_2IN_1OUT(ulong2, %) +DECLOP_2VAR_2IN_1OUT(ulong2, &) +DECLOP_2VAR_2IN_1OUT(ulong2, |) +DECLOP_2VAR_2IN_1OUT(ulong2, ^) +DECLOP_2VAR_2IN_1OUT(ulong2, <<) +DECLOP_2VAR_2IN_1OUT(ulong2, >>) + +DECLOP_2VAR_ASSIGN(ulong2, +=) +DECLOP_2VAR_ASSIGN(ulong2, -=) +DECLOP_2VAR_ASSIGN(ulong2, *=) +DECLOP_2VAR_ASSIGN(ulong2, /=) +DECLOP_2VAR_ASSIGN(ulong2, %=) +DECLOP_2VAR_ASSIGN(ulong2, &=) +DECLOP_2VAR_ASSIGN(ulong2, |=) +DECLOP_2VAR_ASSIGN(ulong2, ^=) +DECLOP_2VAR_ASSIGN(ulong2, <<=) +DECLOP_2VAR_ASSIGN(ulong2, >>=) + +DECLOP_2VAR_PREOP(ulong2, ++) +DECLOP_2VAR_PREOP(ulong2, --) + +DECLOP_2VAR_POSTOP(ulong2, ++) +DECLOP_2VAR_POSTOP(ulong2, --) + +DECLOP_2VAR_COMP(ulong2, ==) +DECLOP_2VAR_COMP(ulong2, !=) +DECLOP_2VAR_COMP(ulong2, <) +DECLOP_2VAR_COMP(ulong2, >) +DECLOP_2VAR_COMP(ulong2, <=) +DECLOP_2VAR_COMP(ulong2, >=) + +DECLOP_2VAR_COMP(ulong2, &&) +DECLOP_2VAR_COMP(ulong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long) + +// UNSIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(ulong3, +) +DECLOP_3VAR_2IN_1OUT(ulong3, -) +DECLOP_3VAR_2IN_1OUT(ulong3, *) +DECLOP_3VAR_2IN_1OUT(ulong3, /) +DECLOP_3VAR_2IN_1OUT(ulong3, %) +DECLOP_3VAR_2IN_1OUT(ulong3, &) +DECLOP_3VAR_2IN_1OUT(ulong3, |) +DECLOP_3VAR_2IN_1OUT(ulong3, ^) +DECLOP_3VAR_2IN_1OUT(ulong3, <<) +DECLOP_3VAR_2IN_1OUT(ulong3, >>) + +DECLOP_3VAR_ASSIGN(ulong3, +=) +DECLOP_3VAR_ASSIGN(ulong3, -=) +DECLOP_3VAR_ASSIGN(ulong3, *=) +DECLOP_3VAR_ASSIGN(ulong3, /=) +DECLOP_3VAR_ASSIGN(ulong3, %=) +DECLOP_3VAR_ASSIGN(ulong3, &=) +DECLOP_3VAR_ASSIGN(ulong3, |=) +DECLOP_3VAR_ASSIGN(ulong3, ^=) +DECLOP_3VAR_ASSIGN(ulong3, <<=) +DECLOP_3VAR_ASSIGN(ulong3, >>=) + +DECLOP_3VAR_PREOP(ulong3, ++) +DECLOP_3VAR_PREOP(ulong3, --) + +DECLOP_3VAR_POSTOP(ulong3, ++) +DECLOP_3VAR_POSTOP(ulong3, --) + +DECLOP_3VAR_COMP(ulong3, ==) +DECLOP_3VAR_COMP(ulong3, !=) +DECLOP_3VAR_COMP(ulong3, <) +DECLOP_3VAR_COMP(ulong3, >) +DECLOP_3VAR_COMP(ulong3, <=) +DECLOP_3VAR_COMP(ulong3, >=) + +DECLOP_3VAR_COMP(ulong3, &&) +DECLOP_3VAR_COMP(ulong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long) + +// UNSIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(ulong4, +) +DECLOP_4VAR_2IN_1OUT(ulong4, -) +DECLOP_4VAR_2IN_1OUT(ulong4, *) +DECLOP_4VAR_2IN_1OUT(ulong4, /) +DECLOP_4VAR_2IN_1OUT(ulong4, %) +DECLOP_4VAR_2IN_1OUT(ulong4, &) +DECLOP_4VAR_2IN_1OUT(ulong4, |) +DECLOP_4VAR_2IN_1OUT(ulong4, ^) +DECLOP_4VAR_2IN_1OUT(ulong4, <<) +DECLOP_4VAR_2IN_1OUT(ulong4, >>) + +DECLOP_4VAR_ASSIGN(ulong4, +=) +DECLOP_4VAR_ASSIGN(ulong4, -=) +DECLOP_4VAR_ASSIGN(ulong4, *=) +DECLOP_4VAR_ASSIGN(ulong4, /=) +DECLOP_4VAR_ASSIGN(ulong4, %=) +DECLOP_4VAR_ASSIGN(ulong4, &=) +DECLOP_4VAR_ASSIGN(ulong4, |=) +DECLOP_4VAR_ASSIGN(ulong4, ^=) +DECLOP_4VAR_ASSIGN(ulong4, <<=) +DECLOP_4VAR_ASSIGN(ulong4, >>=) + +DECLOP_4VAR_PREOP(ulong4, ++) +DECLOP_4VAR_PREOP(ulong4, --) + +DECLOP_4VAR_POSTOP(ulong4, ++) +DECLOP_4VAR_POSTOP(ulong4, --) + +DECLOP_4VAR_COMP(ulong4, ==) +DECLOP_4VAR_COMP(ulong4, !=) +DECLOP_4VAR_COMP(ulong4, <) +DECLOP_4VAR_COMP(ulong4, >) +DECLOP_4VAR_COMP(ulong4, <=) +DECLOP_4VAR_COMP(ulong4, >=) + +DECLOP_4VAR_COMP(ulong4, &&) +DECLOP_4VAR_COMP(ulong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long) + +// SIGNED LONG1 + +DECLOP_1VAR_2IN_1OUT(long1, +) +DECLOP_1VAR_2IN_1OUT(long1, -) +DECLOP_1VAR_2IN_1OUT(long1, *) +DECLOP_1VAR_2IN_1OUT(long1, /) +DECLOP_1VAR_2IN_1OUT(long1, %) +DECLOP_1VAR_2IN_1OUT(long1, &) +DECLOP_1VAR_2IN_1OUT(long1, |) +DECLOP_1VAR_2IN_1OUT(long1, ^) +DECLOP_1VAR_2IN_1OUT(long1, <<) +DECLOP_1VAR_2IN_1OUT(long1, >>) + + +DECLOP_1VAR_ASSIGN(long1, +=) +DECLOP_1VAR_ASSIGN(long1, -=) +DECLOP_1VAR_ASSIGN(long1, *=) +DECLOP_1VAR_ASSIGN(long1, /=) +DECLOP_1VAR_ASSIGN(long1, %=) +DECLOP_1VAR_ASSIGN(long1, &=) +DECLOP_1VAR_ASSIGN(long1, |=) +DECLOP_1VAR_ASSIGN(long1, ^=) +DECLOP_1VAR_ASSIGN(long1, <<=) +DECLOP_1VAR_ASSIGN(long1, >>=) + +DECLOP_1VAR_PREOP(long1, ++) +DECLOP_1VAR_PREOP(long1, --) + +DECLOP_1VAR_POSTOP(long1, ++) +DECLOP_1VAR_POSTOP(long1, --) + +DECLOP_1VAR_COMP(long1, ==) +DECLOP_1VAR_COMP(long1, !=) +DECLOP_1VAR_COMP(long1, <) +DECLOP_1VAR_COMP(long1, >) +DECLOP_1VAR_COMP(long1, <=) +DECLOP_1VAR_COMP(long1, >=) + +DECLOP_1VAR_COMP(long1, &&) +DECLOP_1VAR_COMP(long1, ||) + +DECLOP_1VAR_1IN_1OUT(long1, ~) +DECLOP_1VAR_1IN_BOOLOUT(long1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(long1, float) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(long1, double) +DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long) + +// SIGNED LONG2 + +DECLOP_2VAR_2IN_1OUT(long2, +) +DECLOP_2VAR_2IN_1OUT(long2, -) +DECLOP_2VAR_2IN_1OUT(long2, *) +DECLOP_2VAR_2IN_1OUT(long2, /) +DECLOP_2VAR_2IN_1OUT(long2, %) +DECLOP_2VAR_2IN_1OUT(long2, &) +DECLOP_2VAR_2IN_1OUT(long2, |) +DECLOP_2VAR_2IN_1OUT(long2, ^) +DECLOP_2VAR_2IN_1OUT(long2, <<) +DECLOP_2VAR_2IN_1OUT(long2, >>) + +DECLOP_2VAR_ASSIGN(long2, +=) +DECLOP_2VAR_ASSIGN(long2, -=) +DECLOP_2VAR_ASSIGN(long2, *=) +DECLOP_2VAR_ASSIGN(long2, /=) +DECLOP_2VAR_ASSIGN(long2, %=) +DECLOP_2VAR_ASSIGN(long2, &=) +DECLOP_2VAR_ASSIGN(long2, |=) +DECLOP_2VAR_ASSIGN(long2, ^=) +DECLOP_2VAR_ASSIGN(long2, <<=) +DECLOP_2VAR_ASSIGN(long2, >>=) + +DECLOP_2VAR_PREOP(long2, ++) +DECLOP_2VAR_PREOP(long2, --) + +DECLOP_2VAR_POSTOP(long2, ++) +DECLOP_2VAR_POSTOP(long2, --) + +DECLOP_2VAR_COMP(long2, ==) +DECLOP_2VAR_COMP(long2, !=) +DECLOP_2VAR_COMP(long2, <) +DECLOP_2VAR_COMP(long2, >) +DECLOP_2VAR_COMP(long2, <=) +DECLOP_2VAR_COMP(long2, >=) + +DECLOP_2VAR_COMP(long2, &&) +DECLOP_2VAR_COMP(long2, ||) + +DECLOP_2VAR_1IN_1OUT(long2, ~) +DECLOP_2VAR_1IN_BOOLOUT(long2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(long2, float) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(long2, double) +DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long) + +// SIGNED LONG3 + +DECLOP_3VAR_2IN_1OUT(long3, +) +DECLOP_3VAR_2IN_1OUT(long3, -) +DECLOP_3VAR_2IN_1OUT(long3, *) +DECLOP_3VAR_2IN_1OUT(long3, /) +DECLOP_3VAR_2IN_1OUT(long3, %) +DECLOP_3VAR_2IN_1OUT(long3, &) +DECLOP_3VAR_2IN_1OUT(long3, |) +DECLOP_3VAR_2IN_1OUT(long3, ^) +DECLOP_3VAR_2IN_1OUT(long3, <<) +DECLOP_3VAR_2IN_1OUT(long3, >>) + +DECLOP_3VAR_ASSIGN(long3, +=) +DECLOP_3VAR_ASSIGN(long3, -=) +DECLOP_3VAR_ASSIGN(long3, *=) +DECLOP_3VAR_ASSIGN(long3, /=) +DECLOP_3VAR_ASSIGN(long3, %=) +DECLOP_3VAR_ASSIGN(long3, &=) +DECLOP_3VAR_ASSIGN(long3, |=) +DECLOP_3VAR_ASSIGN(long3, ^=) +DECLOP_3VAR_ASSIGN(long3, <<=) +DECLOP_3VAR_ASSIGN(long3, >>=) + +DECLOP_3VAR_PREOP(long3, ++) +DECLOP_3VAR_PREOP(long3, --) + +DECLOP_3VAR_POSTOP(long3, ++) +DECLOP_3VAR_POSTOP(long3, --) + +DECLOP_3VAR_COMP(long3, ==) +DECLOP_3VAR_COMP(long3, !=) +DECLOP_3VAR_COMP(long3, <) +DECLOP_3VAR_COMP(long3, >) +DECLOP_3VAR_COMP(long3, <=) +DECLOP_3VAR_COMP(long3, >=) + +DECLOP_3VAR_COMP(long3, &&) +DECLOP_3VAR_COMP(long3, ||) + +DECLOP_3VAR_1IN_1OUT(long3, ~) +DECLOP_3VAR_1IN_BOOLOUT(long3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(long3, float) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(long3, double) +DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long) + +// SIGNED LONG4 + +DECLOP_4VAR_2IN_1OUT(long4, +) +DECLOP_4VAR_2IN_1OUT(long4, -) +DECLOP_4VAR_2IN_1OUT(long4, *) +DECLOP_4VAR_2IN_1OUT(long4, /) +DECLOP_4VAR_2IN_1OUT(long4, %) +DECLOP_4VAR_2IN_1OUT(long4, &) +DECLOP_4VAR_2IN_1OUT(long4, |) +DECLOP_4VAR_2IN_1OUT(long4, ^) +DECLOP_4VAR_2IN_1OUT(long4, <<) +DECLOP_4VAR_2IN_1OUT(long4, >>) + +DECLOP_4VAR_ASSIGN(long4, +=) +DECLOP_4VAR_ASSIGN(long4, -=) +DECLOP_4VAR_ASSIGN(long4, *=) +DECLOP_4VAR_ASSIGN(long4, /=) +DECLOP_4VAR_ASSIGN(long4, %=) +DECLOP_4VAR_ASSIGN(long4, &=) +DECLOP_4VAR_ASSIGN(long4, |=) +DECLOP_4VAR_ASSIGN(long4, ^=) +DECLOP_4VAR_ASSIGN(long4, <<=) +DECLOP_4VAR_ASSIGN(long4, >>=) + +DECLOP_4VAR_PREOP(long4, ++) +DECLOP_4VAR_PREOP(long4, --) + +DECLOP_4VAR_POSTOP(long4, ++) +DECLOP_4VAR_POSTOP(long4, --) + +DECLOP_4VAR_COMP(long4, ==) +DECLOP_4VAR_COMP(long4, !=) +DECLOP_4VAR_COMP(long4, <) +DECLOP_4VAR_COMP(long4, >) +DECLOP_4VAR_COMP(long4, <=) +DECLOP_4VAR_COMP(long4, >=) + +DECLOP_4VAR_COMP(long4, &&) +DECLOP_4VAR_COMP(long4, ||) + +DECLOP_4VAR_1IN_1OUT(long4, ~) +DECLOP_4VAR_1IN_BOOLOUT(long4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(long4, float) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(long4, double) +DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long) + +// UNSIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(ulonglong1, +) +DECLOP_1VAR_2IN_1OUT(ulonglong1, -) +DECLOP_1VAR_2IN_1OUT(ulonglong1, *) +DECLOP_1VAR_2IN_1OUT(ulonglong1, /) +DECLOP_1VAR_2IN_1OUT(ulonglong1, %) +DECLOP_1VAR_2IN_1OUT(ulonglong1, &) +DECLOP_1VAR_2IN_1OUT(ulonglong1, |) +DECLOP_1VAR_2IN_1OUT(ulonglong1, ^) +DECLOP_1VAR_2IN_1OUT(ulonglong1, <<) +DECLOP_1VAR_2IN_1OUT(ulonglong1, >>) + + +DECLOP_1VAR_ASSIGN(ulonglong1, +=) +DECLOP_1VAR_ASSIGN(ulonglong1, -=) +DECLOP_1VAR_ASSIGN(ulonglong1, *=) +DECLOP_1VAR_ASSIGN(ulonglong1, /=) +DECLOP_1VAR_ASSIGN(ulonglong1, %=) +DECLOP_1VAR_ASSIGN(ulonglong1, &=) +DECLOP_1VAR_ASSIGN(ulonglong1, |=) +DECLOP_1VAR_ASSIGN(ulonglong1, ^=) +DECLOP_1VAR_ASSIGN(ulonglong1, <<=) +DECLOP_1VAR_ASSIGN(ulonglong1, >>=) + +DECLOP_1VAR_PREOP(ulonglong1, ++) +DECLOP_1VAR_PREOP(ulonglong1, --) + +DECLOP_1VAR_POSTOP(ulonglong1, ++) +DECLOP_1VAR_POSTOP(ulonglong1, --) + +DECLOP_1VAR_COMP(ulonglong1, ==) +DECLOP_1VAR_COMP(ulonglong1, !=) +DECLOP_1VAR_COMP(ulonglong1, <) +DECLOP_1VAR_COMP(ulonglong1, >) +DECLOP_1VAR_COMP(ulonglong1, <=) +DECLOP_1VAR_COMP(ulonglong1, >=) + +DECLOP_1VAR_COMP(ulonglong1, &&) +DECLOP_1VAR_COMP(ulonglong1, ||) + +DECLOP_1VAR_1IN_1OUT(ulonglong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long) + +// UNSIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(ulonglong2, +) +DECLOP_2VAR_2IN_1OUT(ulonglong2, -) +DECLOP_2VAR_2IN_1OUT(ulonglong2, *) +DECLOP_2VAR_2IN_1OUT(ulonglong2, /) +DECLOP_2VAR_2IN_1OUT(ulonglong2, %) +DECLOP_2VAR_2IN_1OUT(ulonglong2, &) +DECLOP_2VAR_2IN_1OUT(ulonglong2, |) +DECLOP_2VAR_2IN_1OUT(ulonglong2, ^) +DECLOP_2VAR_2IN_1OUT(ulonglong2, <<) +DECLOP_2VAR_2IN_1OUT(ulonglong2, >>) + +DECLOP_2VAR_ASSIGN(ulonglong2, +=) +DECLOP_2VAR_ASSIGN(ulonglong2, -=) +DECLOP_2VAR_ASSIGN(ulonglong2, *=) +DECLOP_2VAR_ASSIGN(ulonglong2, /=) +DECLOP_2VAR_ASSIGN(ulonglong2, %=) +DECLOP_2VAR_ASSIGN(ulonglong2, &=) +DECLOP_2VAR_ASSIGN(ulonglong2, |=) +DECLOP_2VAR_ASSIGN(ulonglong2, ^=) +DECLOP_2VAR_ASSIGN(ulonglong2, <<=) +DECLOP_2VAR_ASSIGN(ulonglong2, >>=) + +DECLOP_2VAR_PREOP(ulonglong2, ++) +DECLOP_2VAR_PREOP(ulonglong2, --) + +DECLOP_2VAR_POSTOP(ulonglong2, ++) +DECLOP_2VAR_POSTOP(ulonglong2, --) + +DECLOP_2VAR_COMP(ulonglong2, ==) +DECLOP_2VAR_COMP(ulonglong2, !=) +DECLOP_2VAR_COMP(ulonglong2, <) +DECLOP_2VAR_COMP(ulonglong2, >) +DECLOP_2VAR_COMP(ulonglong2, <=) +DECLOP_2VAR_COMP(ulonglong2, >=) + +DECLOP_2VAR_COMP(ulonglong2, &&) +DECLOP_2VAR_COMP(ulonglong2, ||) + +DECLOP_2VAR_1IN_1OUT(ulonglong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long) + +// UNSIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(ulonglong3, +) +DECLOP_3VAR_2IN_1OUT(ulonglong3, -) +DECLOP_3VAR_2IN_1OUT(ulonglong3, *) +DECLOP_3VAR_2IN_1OUT(ulonglong3, /) +DECLOP_3VAR_2IN_1OUT(ulonglong3, %) +DECLOP_3VAR_2IN_1OUT(ulonglong3, &) +DECLOP_3VAR_2IN_1OUT(ulonglong3, |) +DECLOP_3VAR_2IN_1OUT(ulonglong3, ^) +DECLOP_3VAR_2IN_1OUT(ulonglong3, <<) +DECLOP_3VAR_2IN_1OUT(ulonglong3, >>) + +DECLOP_3VAR_ASSIGN(ulonglong3, +=) +DECLOP_3VAR_ASSIGN(ulonglong3, -=) +DECLOP_3VAR_ASSIGN(ulonglong3, *=) +DECLOP_3VAR_ASSIGN(ulonglong3, /=) +DECLOP_3VAR_ASSIGN(ulonglong3, %=) +DECLOP_3VAR_ASSIGN(ulonglong3, &=) +DECLOP_3VAR_ASSIGN(ulonglong3, |=) +DECLOP_3VAR_ASSIGN(ulonglong3, ^=) +DECLOP_3VAR_ASSIGN(ulonglong3, <<=) +DECLOP_3VAR_ASSIGN(ulonglong3, >>=) + +DECLOP_3VAR_PREOP(ulonglong3, ++) +DECLOP_3VAR_PREOP(ulonglong3, --) + +DECLOP_3VAR_POSTOP(ulonglong3, ++) +DECLOP_3VAR_POSTOP(ulonglong3, --) + +DECLOP_3VAR_COMP(ulonglong3, ==) +DECLOP_3VAR_COMP(ulonglong3, !=) +DECLOP_3VAR_COMP(ulonglong3, <) +DECLOP_3VAR_COMP(ulonglong3, >) +DECLOP_3VAR_COMP(ulonglong3, <=) +DECLOP_3VAR_COMP(ulonglong3, >=) + +DECLOP_3VAR_COMP(ulonglong3, &&) +DECLOP_3VAR_COMP(ulonglong3, ||) + +DECLOP_3VAR_1IN_1OUT(ulonglong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long) + +// UNSIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(ulonglong4, +) +DECLOP_4VAR_2IN_1OUT(ulonglong4, -) +DECLOP_4VAR_2IN_1OUT(ulonglong4, *) +DECLOP_4VAR_2IN_1OUT(ulonglong4, /) +DECLOP_4VAR_2IN_1OUT(ulonglong4, %) +DECLOP_4VAR_2IN_1OUT(ulonglong4, &) +DECLOP_4VAR_2IN_1OUT(ulonglong4, |) +DECLOP_4VAR_2IN_1OUT(ulonglong4, ^) +DECLOP_4VAR_2IN_1OUT(ulonglong4, <<) +DECLOP_4VAR_2IN_1OUT(ulonglong4, >>) + +DECLOP_4VAR_ASSIGN(ulonglong4, +=) +DECLOP_4VAR_ASSIGN(ulonglong4, -=) +DECLOP_4VAR_ASSIGN(ulonglong4, *=) +DECLOP_4VAR_ASSIGN(ulonglong4, /=) +DECLOP_4VAR_ASSIGN(ulonglong4, %=) +DECLOP_4VAR_ASSIGN(ulonglong4, &=) +DECLOP_4VAR_ASSIGN(ulonglong4, |=) +DECLOP_4VAR_ASSIGN(ulonglong4, ^=) +DECLOP_4VAR_ASSIGN(ulonglong4, <<=) +DECLOP_4VAR_ASSIGN(ulonglong4, >>=) + +DECLOP_4VAR_PREOP(ulonglong4, ++) +DECLOP_4VAR_PREOP(ulonglong4, --) + +DECLOP_4VAR_POSTOP(ulonglong4, ++) +DECLOP_4VAR_POSTOP(ulonglong4, --) + +DECLOP_4VAR_COMP(ulonglong4, ==) +DECLOP_4VAR_COMP(ulonglong4, !=) +DECLOP_4VAR_COMP(ulonglong4, <) +DECLOP_4VAR_COMP(ulonglong4, >) +DECLOP_4VAR_COMP(ulonglong4, <=) +DECLOP_4VAR_COMP(ulonglong4, >=) + +DECLOP_4VAR_COMP(ulonglong4, &&) +DECLOP_4VAR_COMP(ulonglong4, ||) + +DECLOP_4VAR_1IN_1OUT(ulonglong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long) + +// SIGNED LONGLONG1 + +DECLOP_1VAR_2IN_1OUT(longlong1, +) +DECLOP_1VAR_2IN_1OUT(longlong1, -) +DECLOP_1VAR_2IN_1OUT(longlong1, *) +DECLOP_1VAR_2IN_1OUT(longlong1, /) +DECLOP_1VAR_2IN_1OUT(longlong1, %) +DECLOP_1VAR_2IN_1OUT(longlong1, &) +DECLOP_1VAR_2IN_1OUT(longlong1, |) +DECLOP_1VAR_2IN_1OUT(longlong1, ^) +DECLOP_1VAR_2IN_1OUT(longlong1, <<) +DECLOP_1VAR_2IN_1OUT(longlong1, >>) + + +DECLOP_1VAR_ASSIGN(longlong1, +=) +DECLOP_1VAR_ASSIGN(longlong1, -=) +DECLOP_1VAR_ASSIGN(longlong1, *=) +DECLOP_1VAR_ASSIGN(longlong1, /=) +DECLOP_1VAR_ASSIGN(longlong1, %=) +DECLOP_1VAR_ASSIGN(longlong1, &=) +DECLOP_1VAR_ASSIGN(longlong1, |=) +DECLOP_1VAR_ASSIGN(longlong1, ^=) +DECLOP_1VAR_ASSIGN(longlong1, <<=) +DECLOP_1VAR_ASSIGN(longlong1, >>=) + +DECLOP_1VAR_PREOP(longlong1, ++) +DECLOP_1VAR_PREOP(longlong1, --) + +DECLOP_1VAR_POSTOP(longlong1, ++) +DECLOP_1VAR_POSTOP(longlong1, --) + +DECLOP_1VAR_COMP(longlong1, ==) +DECLOP_1VAR_COMP(longlong1, !=) +DECLOP_1VAR_COMP(longlong1, <) +DECLOP_1VAR_COMP(longlong1, >) +DECLOP_1VAR_COMP(longlong1, <=) +DECLOP_1VAR_COMP(longlong1, >=) + +DECLOP_1VAR_COMP(longlong1, &&) +DECLOP_1VAR_COMP(longlong1, ||) + +DECLOP_1VAR_1IN_1OUT(longlong1, ~) +DECLOP_1VAR_1IN_BOOLOUT(longlong1, !) 
+ +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, float) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, double) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long) +DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long) + +// SIGNED LONGLONG2 + +DECLOP_2VAR_2IN_1OUT(longlong2, +) +DECLOP_2VAR_2IN_1OUT(longlong2, -) +DECLOP_2VAR_2IN_1OUT(longlong2, *) +DECLOP_2VAR_2IN_1OUT(longlong2, /) +DECLOP_2VAR_2IN_1OUT(longlong2, %) +DECLOP_2VAR_2IN_1OUT(longlong2, &) +DECLOP_2VAR_2IN_1OUT(longlong2, |) +DECLOP_2VAR_2IN_1OUT(longlong2, ^) +DECLOP_2VAR_2IN_1OUT(longlong2, <<) +DECLOP_2VAR_2IN_1OUT(longlong2, >>) + +DECLOP_2VAR_ASSIGN(longlong2, +=) +DECLOP_2VAR_ASSIGN(longlong2, -=) +DECLOP_2VAR_ASSIGN(longlong2, *=) +DECLOP_2VAR_ASSIGN(longlong2, /=) +DECLOP_2VAR_ASSIGN(longlong2, %=) +DECLOP_2VAR_ASSIGN(longlong2, &=) +DECLOP_2VAR_ASSIGN(longlong2, |=) +DECLOP_2VAR_ASSIGN(longlong2, ^=) +DECLOP_2VAR_ASSIGN(longlong2, <<=) +DECLOP_2VAR_ASSIGN(longlong2, >>=) + +DECLOP_2VAR_PREOP(longlong2, ++) +DECLOP_2VAR_PREOP(longlong2, --) + +DECLOP_2VAR_POSTOP(longlong2, ++) +DECLOP_2VAR_POSTOP(longlong2, --) + +DECLOP_2VAR_COMP(longlong2, ==) +DECLOP_2VAR_COMP(longlong2, !=) +DECLOP_2VAR_COMP(longlong2, <) +DECLOP_2VAR_COMP(longlong2, >) +DECLOP_2VAR_COMP(longlong2, <=) +DECLOP_2VAR_COMP(longlong2, >=) + +DECLOP_2VAR_COMP(longlong2, &&) +DECLOP_2VAR_COMP(longlong2, ||) + +DECLOP_2VAR_1IN_1OUT(longlong2, ~) +DECLOP_2VAR_1IN_BOOLOUT(longlong2, !) 
+ +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, float) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, double) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long) +DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long) + +// SIGNED LONGLONG3 + +DECLOP_3VAR_2IN_1OUT(longlong3, +) +DECLOP_3VAR_2IN_1OUT(longlong3, -) +DECLOP_3VAR_2IN_1OUT(longlong3, *) +DECLOP_3VAR_2IN_1OUT(longlong3, /) +DECLOP_3VAR_2IN_1OUT(longlong3, %) +DECLOP_3VAR_2IN_1OUT(longlong3, &) +DECLOP_3VAR_2IN_1OUT(longlong3, |) +DECLOP_3VAR_2IN_1OUT(longlong3, ^) +DECLOP_3VAR_2IN_1OUT(longlong3, <<) +DECLOP_3VAR_2IN_1OUT(longlong3, >>) + +DECLOP_3VAR_ASSIGN(longlong3, +=) +DECLOP_3VAR_ASSIGN(longlong3, -=) +DECLOP_3VAR_ASSIGN(longlong3, *=) +DECLOP_3VAR_ASSIGN(longlong3, /=) +DECLOP_3VAR_ASSIGN(longlong3, %=) +DECLOP_3VAR_ASSIGN(longlong3, &=) +DECLOP_3VAR_ASSIGN(longlong3, |=) +DECLOP_3VAR_ASSIGN(longlong3, ^=) +DECLOP_3VAR_ASSIGN(longlong3, <<=) +DECLOP_3VAR_ASSIGN(longlong3, >>=) + +DECLOP_3VAR_PREOP(longlong3, ++) +DECLOP_3VAR_PREOP(longlong3, --) + +DECLOP_3VAR_POSTOP(longlong3, ++) +DECLOP_3VAR_POSTOP(longlong3, --) + +DECLOP_3VAR_COMP(longlong3, ==) +DECLOP_3VAR_COMP(longlong3, !=) +DECLOP_3VAR_COMP(longlong3, <) +DECLOP_3VAR_COMP(longlong3, >) +DECLOP_3VAR_COMP(longlong3, <=) +DECLOP_3VAR_COMP(longlong3, >=) + +DECLOP_3VAR_COMP(longlong3, &&) +DECLOP_3VAR_COMP(longlong3, ||) + +DECLOP_3VAR_1IN_1OUT(longlong3, ~) +DECLOP_3VAR_1IN_BOOLOUT(longlong3, !) 
+ +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, float) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, double) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long) +DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long) + +// SIGNED LONGLONG4 + +DECLOP_4VAR_2IN_1OUT(longlong4, +) +DECLOP_4VAR_2IN_1OUT(longlong4, -) +DECLOP_4VAR_2IN_1OUT(longlong4, *) +DECLOP_4VAR_2IN_1OUT(longlong4, /) +DECLOP_4VAR_2IN_1OUT(longlong4, %) +DECLOP_4VAR_2IN_1OUT(longlong4, &) +DECLOP_4VAR_2IN_1OUT(longlong4, |) +DECLOP_4VAR_2IN_1OUT(longlong4, ^) +DECLOP_4VAR_2IN_1OUT(longlong4, <<) +DECLOP_4VAR_2IN_1OUT(longlong4, >>) + +DECLOP_4VAR_ASSIGN(longlong4, +=) +DECLOP_4VAR_ASSIGN(longlong4, -=) +DECLOP_4VAR_ASSIGN(longlong4, *=) +DECLOP_4VAR_ASSIGN(longlong4, /=) +DECLOP_4VAR_ASSIGN(longlong4, %=) +DECLOP_4VAR_ASSIGN(longlong4, &=) +DECLOP_4VAR_ASSIGN(longlong4, |=) +DECLOP_4VAR_ASSIGN(longlong4, ^=) +DECLOP_4VAR_ASSIGN(longlong4, <<=) +DECLOP_4VAR_ASSIGN(longlong4, >>=) + +DECLOP_4VAR_PREOP(longlong4, ++) +DECLOP_4VAR_PREOP(longlong4, --) + +DECLOP_4VAR_POSTOP(longlong4, ++) +DECLOP_4VAR_POSTOP(longlong4, --) + +DECLOP_4VAR_COMP(longlong4, ==) +DECLOP_4VAR_COMP(longlong4, !=) +DECLOP_4VAR_COMP(longlong4, <) +DECLOP_4VAR_COMP(longlong4, >) +DECLOP_4VAR_COMP(longlong4, <=) +DECLOP_4VAR_COMP(longlong4, >=) + +DECLOP_4VAR_COMP(longlong4, &&) +DECLOP_4VAR_COMP(longlong4, ||) + +DECLOP_4VAR_1IN_1OUT(longlong4, ~) +DECLOP_4VAR_1IN_BOOLOUT(longlong4, !) 
+ +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, float) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, double) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long) +DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long) #endif From 052f630bd332e01f952ddb2d1796fafcabdad2f6 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:22:14 +0300 Subject: [PATCH 093/171] [HIP] [HIPIFY] [FIX] cuModuleLoadDataEx -> hipModuleLoadDataEx https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/81 1. Do not use JIT options on HCC path, call hipModuleLoadData instead. 2. NVCC path is unchanged, to call cuModuleLoadDataEx with all options. 3. Get rid of manual hipification, based on #ifdef #else for NVCC/HIP. 4. Update documentation accordingly. 
[ROCm/clr commit: ae9f14ef9c7239455d2e9e3eb6f0dfc6e0afb25d] --- .../docs/markdown/hip_porting_driver_api.md | 65 +++++++++++-------- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 38 +++++------ .../include/hip/hcc_detail/hip_runtime_api.h | 19 ++++-- .../clr/hipamd/include/hip/hip_runtime_api.h | 24 +++++++ .../include/hip/nvcc_detail/hip_runtime_api.h | 44 ++++++++++--- projects/clr/hipamd/src/hip_module.cpp | 6 +- 6 files changed, 134 insertions(+), 62 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_porting_driver_api.md b/projects/clr/hipamd/docs/markdown/hip_porting_driver_api.md index dd3b9c3e86..0912e676cc 100644 --- a/projects/clr/hipamd/docs/markdown/hip_porting_driver_api.md +++ b/projects/clr/hipamd/docs/markdown/hip_porting_driver_api.md @@ -98,48 +98,57 @@ HIP/HCC will push primary context to context stack when it is empty. This can ha #### Interoperation between HIP and CUDA Driver CUDA applications may want to mix CUDA driver code with HIP code (see example below). This table shows the type equivalence to enable this interaction. -|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| -| ---- | ---- | ---- | -| hipModule | CUmodule | | -| hipFunction | CUfunction | | -| hipCtx_t | CUcontext | | -| hipDevice_t | CUdevice | | -| hipStream_t | CUstream | cudaStream_t | -| hipEvent_t | CUevent | cudaEvent_t | -| hipArray | CUarray | cudaArray | - -#### Compilation Flags -The hipModule interface does not support the `cuModuleLoadDataEx` function, which is used to control PTX compilation options. -HCC does not use PTX and does not support the same compilation options. -In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as part of the load step. -Code which requires this functionally should use platform-specific coding, calling `cuModuleLoadDataEx` on the NVCC path and `hipModuleLoadData` on the hcc path. 
-For example: +|**HIP Type** |**CU Driver Type**|**CUDA Runtime Type**| +| ---- | ---- | ---- | +| hipModule_t | CUmodule | | +| hipFunction_t | CUfunction | | +| hipCtx_t | CUcontext | | +| hipDevice_t | CUdevice | | +| hipStream_t | CUstream | cudaStream_t | +| hipEvent_t | CUevent | cudaEvent_t | +| hipArray | CUarray | cudaArray | +#### Compilation Options +The `hipModule_t` interface does not support `cuModuleLoadDataEx` function, which is used to control PTX compilation options. +HCC does not use PTX and does not support these compilation options. +In fact, HCC code objects always contain fully compiled ISA and do not require additional compilation as a part of the load step. +The corresponding HIP function `hipModuleLoadDataEx` behaves as `hipModuleLoadData` on HCC path (compilation options are not used) and as `cuModuleLoadDataEx` on NVCC path. +For example (CUDA): ``` -hipModule module; -void *imagePtr = ... ; // Somehow populate data pointer with code object +CUmodule module; +void *imagePtr = ...; // Somehow populate data pointer with code object -#ifdef __HIP_PLATFORM_NVCC__ -// Use CUDA driver API but write to hipModule since they are same type: const int numOptions = 1; CUJit_option options[numOptions]; void * optionValues[numOptions]; options[0] = CU_JIT_MAX_REGISTERS; -unsigned maxRegs=15; -optionValues[0] = (void*) (&maxRegs); +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); cuModuleLoadDataEx(module, imagePtr, numOptions, options, optionValues); -#else // __HIP_PLATFORM_HCC__ +CUfunction k; +cuModuleGetFunction(&k, module, "myKernel"); +``` +HIP: +``` +hipModule_t module; +void *imagePtr = ...; // Somehow populate data pointer with code object -// HCC path does not support or require JIT options, so just load the module. 
-hipModuleLoadData(&module, imagePtr); +const int numOptions = 1; +hipJitOption options[numOptions]; +void * optionValues[numOptions]; -#endif +options[0] = hipJitOptionMaxRegisters; +unsigned maxRegs = 15; +optionValues[0] = (void*)(&maxRegs); -// Back to unified code - both paths above loaded the "module" variable. -hipFunction k; +// hipModuleLoadData(&module, imagePtr) will be called on HCC path, JIT options will not be used, and +// cuModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues) will be called on NVCC path +hipModuleLoadDataEx(&module, imagePtr, numOptions, options, optionValues); + +hipFunction_t k; hipModuleGetFunction(&k, module, "myKernel"); ``` diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 0c6b0f1efc..e07baab3fd 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -678,24 +678,24 @@ struct cuda2hipMap { cuda2hipRename["CU_PREFER_PTX"] = {"hipJitFallbackPreferPtx", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_PREFER_BINARY"] = {"hipJitFallbackPreferBinary", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjit_option/CUjit_option_enum - cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) - cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = 
{"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUjit_option"] = {"hipJitOption", CONV_JIT, API_DRIVER}; // API_Runtime ANALOGUE (no) + cuda2hipRename["CUjit_option_enum"] = {"hipJitOption", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_MAX_REGISTERS"] = {"hipJitOptionMaxRegisters", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_THREADS_PER_BLOCK"] = {"hipJitOptionThreadsPerBlock", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_WALL_TIME"] = {"hipJitOptionWallTime", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER"] = {"hipJitOptionInfoLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES"] = 
{"hipJitOptionInfoLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER"] = {"hipJitOptionErrorLogBuffer", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES"] = {"hipJitOptionErrorLogBufferSizeBytes", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_OPTIMIZATION_LEVEL"] = {"hipJitOptionOptimizationLevel", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET_FROM_CUCONTEXT"] = {"hipJitOptionTargetFromContext", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_TARGET"] = {"hipJitOptionTarget", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_FALLBACK_STRATEGY"] = {"hipJitOptionFallbackStrategy", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_DEBUG_INFO"] = {"hipJitOptionGenerateDebugInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjit_target_enum"] = {"hipJitTarget", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; @@ -905,7 +905,7 @@ struct cuda2hipMap { cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; // unsupported yet by HIP - cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuModuleUnload"] = 
{"hipModuleUnload", CONV_MODULE, API_DRIVER}; diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index e1aecef1e8..34ed2ed5ce 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -1915,7 +1915,7 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con * @brief returns device memory pointer and size of the kernel present in the module with symbol @p name * * @param [out] dptr - * @param [out[ bytes + * @param [out] bytes * @param [in] hmod * @param [in] name * @@ -1923,7 +1923,6 @@ hipError_t hipModuleGetFunction(hipFunction_t *function, hipModule_t module, con */ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char *name); - /** * @brief builds module from code object which resides in host memory. Image is pointer to that location. * @@ -1934,11 +1933,23 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t h */ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); +/** +* @brief builds module from code object which resides in host memory. Image is pointer to that location. Options are not used. hipModuleLoadData is called. +* +* @param [in] image +* @param [out] module +* @param [in] numOptions number of options +* @param [in] options options for JIT +* @param [in] optionValues option values for JIT +* +* @returns hipSuccess, hipErrorNotInitialized, hipErrorOutOfMemory +*/ +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues); /** * @brief launches kernel f with launch parameters and shared memory on stream with arguments passed to kernelparams or extra * - * @param [in[ f Kernel to launch. + * @param [in] f Kernel to launch. * @param [in] gridDimX X grid dimension specified as multiple of blockDimX. 
* @param [in] gridDimY Y grid dimension specified as multiple of blockDimY. * @param [in] gridDimZ Z grid dimension specified as multiple of blockDimZ. @@ -1946,7 +1957,7 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image); * @param [in] blockDimY Y grid dimension specified in work-items * @param [in] blockDimZ Z grid dimension specified in work-items * @param [in] sharedMemBytes Amount of dynamic shared memory to allocate for this kernel. The kernel can access this with HIP_DYNAMIC_SHARED. - * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case th default stream is used with associated synchronization rules. + * @param [in] stream Stream where the kernel should be dispatched. May be 0, in which case the default stream is used with associated synchronization rules. * @param [in] kernelParams * @param [in] extra Pointer to kernel arguments. These are passed directly to the kernel and must be in the memory layout and alignment expected by the kernel. * diff --git a/projects/clr/hipamd/include/hip/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hip_runtime_api.h index 8eae1d6a3a..fa54dda5dc 100644 --- a/projects/clr/hipamd/include/hip/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hip_runtime_api.h @@ -250,6 +250,30 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. 
} hipDeviceAttribute_t; +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + /** * @} */ diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 69a9b46570..01a93f7ba4 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -54,26 +54,44 @@ hipMemcpyHostToHost #define hipFilterModePoint cudaFilterModePoint //! 
Flags that can be used with hipEventCreateWithFlags: -#define hipEventDefault cudaEventDefault -#define hipEventBlockingSync cudaEventBlockingSync -#define hipEventDisableTiming cudaEventDisableTiming -#define hipEventInterprocess cudaEventInterprocess +#define hipEventDefault cudaEventDefault +#define hipEventBlockingSync cudaEventBlockingSync +#define hipEventDisableTiming cudaEventDisableTiming +#define hipEventInterprocess cudaEventInterprocess #define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ -#define hipHostMallocDefault cudaHostAllocDefault -#define hipHostMallocPortable cudaHostAllocPortable -#define hipHostMallocMapped cudaHostAllocMapped +#define hipHostMallocDefault cudaHostAllocDefault +#define hipHostMallocPortable cudaHostAllocPortable +#define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined #define hipHostRegisterPortable cudaHostRegisterPortable -#define hipHostRegisterMapped cudaHostRegisterMapped +#define hipHostRegisterMapped cudaHostRegisterMapped #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER -#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE +#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE #define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END #define hipLimitMallocHeapSize cudaLimitMallocHeapSize -#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess +#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess + +// enum CUjit_option redefines +#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS +#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK +#define hipJitOptionWallTime CU_JIT_WALL_TIME +#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER +#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES +#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER +#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES +#define 
hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL +#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT +#define hipJitOptionTarget CU_JIT_TARGET +#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY +#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO +#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE +#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO +#define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; typedef cudaStream_t hipStream_t; @@ -84,6 +102,7 @@ typedef cudaFuncCache hipFuncCache_t; typedef CUcontext hipCtx_t; typedef CUsharedconfig hipSharedMemConfig; typedef CUfunc_cache hipFuncCache; +typedef CUjit_option hipJitOption; typedef CUdevice hipDevice_t; typedef CUmodule hipModule_t; typedef CUfunction hipFunction_t; @@ -894,6 +913,11 @@ inline static hipError_t hipModuleLoadData(hipModule_t *module, const void *imag return hipCUResultTohipError(cuModuleLoadData(module, image)); } +inline static hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipCUResultTohipError(cuModuleLoadDataEx(module, image, numOptions, options, optionValues)); +} + inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ, diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index da01f23769..d364a6b519 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -525,7 +525,6 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, } } - hipError_t hipModuleLoadData(hipModule_t *module, const void *image) { HIP_INIT_API(module, image); @@ -575,3 +574,8 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void 
*image) } return ihipLogStatus(ret); } + +hipError_t hipModuleLoadDataEx(hipModule_t *module, const void *image, unsigned int numOptions, hipJitOption *options, void **optionValues) +{ + return hipModuleLoadData(module, image); +} From aebc80c8e0dfa220f56341c9daac0299e2e44a4f Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 19 May 2017 17:39:09 +0300 Subject: [PATCH 094/171] [HIPIFY] [FIX] [HIPIFY] Matcher for pointer to enum var declaration is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; [TODO] 1. new CUjit_option -> new hipJitOption. Matcher for new operator is missing: https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/79 2. Merge matchers cudaEnumDecl and cudaEnumVarPtr. [ROCm/clr commit: 51b742035922f0e9c2bc694f1f7e5a41321a6c54] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 51 +++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index e07baab3fd..34d3d6e24f 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -2905,6 +2905,51 @@ private: return false; } + bool cudaEnumVarPtr(const MatchFinder::MatchResult &Result) { + if (const VarDecl *enumVarPtr = Result.Nodes.getNodeAs("cudaEnumVarPtr")) { + const Type *t = enumVarPtr->getType().getTypePtrOrNull(); + if (t) { + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + QT = enumVarPtr->getType().getUnqualifiedType(); + std::string name_unqualified = QT.getAsString(); + if ((name_unqualified.find(' ') == std::string::npos && name.find(' ') == std::string::npos) || name.empty()) { + name = name_unqualified; + } + // 
Workaround for enum VarDecl as param decl, declared with enum type specifier + // Example: void func(enum cudaMemcpyKind kind); + //------------------------------------------------- + SourceManager *SM = Result.SourceManager; + TypeLoc TL = enumVarPtr->getTypeSourceInfo()->getTypeLoc(); + SourceLocation sl(TL.getUnqualifiedLoc().getLocStart()); + SourceLocation end(TL.getUnqualifiedLoc().getLocEnd()); + size_t repLength = SM->getCharacterData(end) - SM->getCharacterData(sl); + StringRef sfull = StringRef(SM->getCharacterData(sl), repLength); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + //------------------------------------------------- + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [enum var ptr]."; + printHipifyMessage(*SM, sl, msg); + } + } + return true; + } + return false; + } + bool cudaTypedefVar(const MatchFinder::MatchResult &Result) { if (const VarDecl *typedefVar = Result.Nodes.getNodeAs("cudaTypedefVar")) { QualType QT = typedefVar->getType(); @@ -3185,6 +3230,7 @@ public: if (cudaBuiltin(Result)) break; if (cudaEnumConstantRef(Result)) break; if (cudaEnumDecl(Result)) break; + if (cudaEnumVarPtr(Result)) break; if (cudaTypedefVar(Result)) break; if (cudaTypedefVarPtr(Result)) break; if (cudaStructVar(Result)) break; @@ -3232,6 +3278,11 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(enumDecl())) .bind("cudaEnumDecl"), Callback); + Finder.addMatcher(varDecl(isExpansionInMainFile(), + hasType(pointsTo(enumDecl( + matchesName("cu.*|CU.*"))))) + .bind("cudaEnumVarPtr"), + Callback); 
Finder.addMatcher(varDecl(isExpansionInMainFile(), hasType(typedefDecl(matchesName("cu.*|CU.*")))) .bind("cudaTypedefVar"), From f848e9f11724d36dc3c0565ab4c650a843f6da3e Mon Sep 17 00:00:00 2001 From: pensun Date: Mon, 22 May 2017 08:52:43 -0500 Subject: [PATCH 095/171] fix GGL helper header file, reorder for C++17 Change-Id: I3d9ddfe670bf7e3e8e7bd85e52cc61f48c19c213 [ROCm/clr commit: 2523f8a492fc6f247b12330757f2432818414b0e] --- projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp b/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp index 611929766b..b5502c1efb 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/helpers.hpp @@ -102,9 +102,6 @@ namespace hip_impl // Not callable. template struct is_callable_impl : std::false_type {}; - - template - struct is_callable : is_callable_impl {}; #else template struct is_callable_impl : std::false_type {}; @@ -114,6 +111,8 @@ namespace hip_impl F(Ts...), void_t_>> : std::true_type {}; #endif + template + struct is_callable : is_callable_impl {}; #define count_macro_args_impl_hip_(\ _0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15,\ From e7813b1933ecd0ffa59065800adced44d5f5ce79 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 23 May 2017 09:32:19 -0500 Subject: [PATCH 096/171] fixed erfinv build error as it is implemented in hcc Change-Id: I27a512147c53f658a63fdf3e90f5e9cfac09ada8 [ROCm/clr commit: 490355203b17c8494118abb7e749620397e4b5ae] --- projects/clr/hipamd/src/math_functions.cpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp index 3472216309..f1e52c4036 100644 --- a/projects/clr/hipamd/src/math_functions.cpp +++ b/projects/clr/hipamd/src/math_functions.cpp @@ -830,16 +830,6 @@ 
__host__ double erfcinv(double y) return __hip_host_erfcinv(y); } -__host__ float erfinvf(float x) -{ - return __hip_host_erfinvf(x); -} - -__host__ double erfinv(double x) -{ - return __hip_host_erfinv(x); -} - __host__ double fdivide(double x, double y) { return x/y; @@ -949,7 +939,7 @@ __host__ void sincospi(double x, double *sptr, double *cptr) __host__ float normcdfinvf(float x) { - return std::sqrt(2) * erfinv(2*x-1); + return std::sqrt(2) * erfinvf(2*x-1); } __host__ double normcdfinv(double x) From 9e7a50b1e000a38fab4f0fcdec674fe505ad10a3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 23 May 2017 19:45:38 +0300 Subject: [PATCH 097/171] [FIX] [HIPIFY] Matcher for new operator is missing. https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/80 Example from CUDA 8.0.44 sample (CUDASamples\0_Simple\matrixMulDrv\matrixMulDrv.cpp): CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; where CUjit_option is enum, should be: hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; [ROCm/clr commit: 21d74f09b9cf267663e7197860527ed9278b11ef] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 37 +++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 34d3d6e24f..dcb9c3d216 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -3095,6 +3095,35 @@ private: return false; } + bool cudaNewOperatorDecl(const MatchFinder::MatchResult &Result) { + if (const auto *newOperator = Result.Nodes.getNodeAs("cudaNewOperatorDecl")) { + const Type *t = newOperator->getType().getTypePtrOrNull(); + if (t) { + SourceManager *SM = Result.SourceManager; + TypeLoc TL = newOperator->getAllocatedTypeSourceInfo()->getTypeLoc(); + SourceLocation sl = TL.getUnqualifiedLoc().getLocStart(); + QualType QT = t->getPointeeType(); + std::string name = QT.getAsString(); + const auto found = 
N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [new operator]."; + printHipifyMessage(*SM, sl, msg); + } + } + } + return false; + } + + bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; if (const VarDecl *sharedVar = Result.Nodes.getNodeAs(refName)) { @@ -3239,6 +3268,7 @@ public: if (cudaParamDecl(Result)) break; if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; + if (cudaNewOperatorDecl(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3336,6 +3366,13 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(incompleteArrayType()))) .bind("cudaSharedIncompleteArrayVar"), Callback); + // Example: + // CUjit_option *jitOptions = new CUjit_option[jitNumOptions]; + // hipJitOption *jitOptions = new hipJitOption[jitNumOptions]; + Finder.addMatcher(cxxNewExpr(isExpansionInMainFile(), + hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaNewOperatorDecl"), + Callback); } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From 0cde8e5db4123e8525bde36c9175f13b399f3ce6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:22 -0500 Subject: [PATCH 098/171] Fix trace category for hipHostMalloc [ROCm/clr commit: ca07615c37635059099684bd36a62524d3d7d54a] --- projects/clr/hipamd/src/hip_memory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 
fc2ada134e..3f95cd22b4 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -245,7 +245,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { - HIP_INIT_SPECIAL_API((TRACE_MCMD), ptr, sizeBytes, flags); + HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; From 2e8625a208fb52ad5f593fe66219cd1c30287857 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 22:59:54 -0500 Subject: [PATCH 099/171] Use accelerator_scope for create_marker and create_blocking_marker. As optimization when system-scope is not needed. [ROCm/clr commit: 2d5b3359c644ba283813d46fb55fbd59e73b726a] --- projects/clr/hipamd/src/hip_hcc.cpp | 4 ++-- projects/clr/hipamd/src/math_functions.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 979a2e5028..efa05cbb93 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -352,7 +352,7 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) this->ensureHaveQueue(crit); - crit->_av.create_blocking_marker(event->_marker); + crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } // Create a marker in this stream. 
@@ -1490,7 +1490,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker - thisStreamCrit->_av.create_blocking_marker(dcf); + thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); } } } diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp index f1e52c4036..151627fc73 100644 --- a/projects/clr/hipamd/src/math_functions.cpp +++ b/projects/clr/hipamd/src/math_functions.cpp @@ -942,10 +942,10 @@ __host__ float normcdfinvf(float x) return std::sqrt(2) * erfinvf(2*x-1); } -__host__ double normcdfinv(double x) -{ - return std::sqrt(2) * erfinv(2*x-1); -} +//__host__ double normcdfinv(double x) +//{ +// return std::sqrt(2) * erfinv(2*x-1); +//} __host__ float nextafterf(float x, float y) { From 59e07db8650476bdfbfba8a77ae95fe772ffb458 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:14:38 -0500 Subject: [PATCH 100/171] Expand test to cover copy followed by event sync [ROCm/clr commit: 92bd54d7b31c801350392d39347953b8a3b77dfd] --- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 81 ++++++++++++++++--- 1 file changed, 70 insertions(+), 11 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index 80ff7ad98d..d12b07289b 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -41,8 +41,14 @@ unsigned p_count = 100; // Structure for one stream; template class Streamer { + +#define COMMAND_ADD_FORWARD 0 +#define COMMAND_ADD_REVERSE 1 +#define COMMAND_COPY 2 + + public: - Streamer(int deviceId, T *input, size_t numElements, bool reverse); + Streamer(int 
deviceId, T *input, size_t numElements, int commandType); ~Streamer(); void runAsyncAfter(Streamer *depStreamer, bool waitSameStream=false); void runAsyncWaitSameStream(); @@ -57,7 +63,11 @@ public: size_t mismatchCount() const { return _mismatchCount; }; T *C_d() { return _C_d; }; + // How much does this streamer add to A[i] after running runAsyncAfter + int expectedAdd() const { return (_commandType == COMMAND_COPY) ? 0 : p_count; }; + + int _commandType; // 0=addReverse, 1=addFwd, 2=move private: T *_C_h; @@ -71,22 +81,23 @@ private: int _deviceId; size_t _numElements; - bool _reverse; size_t _mismatchCount; }; template -Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : +Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), - _reverse(reverse) + _commandType(commandType) { size_t sizeElements = numElements * sizeof(int); + //if (commandType == 0) _commandType = 1; // TODO - remove me + HIPCHECK(hipSetDevice(_deviceId)); @@ -115,6 +126,23 @@ Streamer::Streamer(int deviceId, T * A_d, size_t numElements, bool reverse) : }; +template +Streamer::~Streamer() +{ + HIPCHECK(hipSetDevice(_deviceId)); + + printf ("info: ~Streamer\n"); + if (_preA_d) { + HIPCHECK(hipFree(_preA_d)); + } + HIPCHECK(hipFree(_C_d)); + HIPCHECK(hipHostFree(_C_h)); + + HIPCHECK(hipStreamDestroy(_stream)); + HIPCHECK(hipEventDestroy(_event)); +} + + template void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) { @@ -134,10 +162,14 @@ void Streamer::runAsyncAfter(Streamer *depStreamer, bool waitSameStream) unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, _numElements); - if (_reverse) { + if (_commandType == COMMAND_ADD_REVERSE) { hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); - } else { + } else if (_commandType == COMMAND_ADD_FORWARD) { 
hipLaunchKernelGGL(HipTest::addCount, dim3(blocks), dim3(threadsPerBlock), 0, _stream, _A_d, _C_d, _numElements, p_count); + } else if (_commandType == COMMAND_COPY) { + HIPCHECK(hipMemcpyAsync(_C_d, _A_d, _numElements * sizeof(T), hipMemcpyDeviceToDevice, _stream)); + } else { + assert(0); // bad command type } HIPCHECK(hipEventRecord(_event, _stream)); @@ -263,9 +295,13 @@ void checkAll(int initValue, std::vector &streamers, std::vector< } + int expected = 0; // Check in forward order so we can find first mismatch: for (int i=0; icheck(i+1, initValue, (i+1)*p_count, expectPass); + + expected += streamers[i]->expectedAdd(); + + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } if (!expectPass && (mismatchCount==0)) { @@ -305,7 +341,7 @@ void sync_allDevices(int numDevices) void sync_queryAllUntilComplete(std::vector streamers) { - for (int i=0; i=0; i--) { streamers[i]->queryUntilComplete(); }; } @@ -334,8 +370,6 @@ int main(int argc, char *argv[]) - std::vector streamers; - std::vector streamersDev0; // streamers for first device. size_t numElements = N; size_t sizeElements = numElements * sizeof(int); @@ -361,9 +395,13 @@ int main(int argc, char *argv[]) HIPCHECK(hipGetDeviceCount(&numDevices)); numDevices = min(2, numDevices); // multi-GPU to 2 device. + std::vector streamers; + std::vector streamersDev0; // streamers for first device. + for (int d=0; dC_d() : initArray_d, numElements, i&1 /*reverse?*/); + int command = (i%2) ? COMMAND_ADD_FORWARD : COMMAND_ADD_REVERSE; + IntStreamer * s = new IntStreamer(d, i ? streamers.back()->C_d() : initArray_d, numElements, command); streamers.push_back(s); if (d==0) { streamersDev0.push_back(s); @@ -371,6 +409,10 @@ int main(int argc, char *argv[]) } } + + + + // A sideband stream channel that is independent from above. // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. 
@@ -383,7 +425,10 @@ int main(int argc, char *argv[]) // Tests on first GPU: + // + // This test has no synchronization - make sure it mismatches so we can ensure the other tests properyl prevent the mismatch: RUN_SYNC_TEST(0x01, streamersDev0, sync_none(), false); + RUN_SYNC_TEST(0x02, streamersDev0, sync_allDevices(numDevices), true); RUN_SYNC_TEST(0x04, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); RUN_SYNC_TEST(0x08, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); @@ -419,5 +464,19 @@ int main(int argc, char *argv[]) } + // Change Adds to copies to stimulate different case with event followign copy: + for (auto &s : streamers) { + if (s->_commandType == COMMAND_ADD_FORWARD) + s->_commandType = COMMAND_COPY; + } + + + { + printf ("test: alternating memcpy/count-reverse followed by event\n"); + RUN_SYNC_TEST(0x4000, streamersDev0, sync_queryAllUntilComplete(streamersDev0), true); + RUN_SYNC_TEST(0x8000, streamersDev0, sync_streamWaitEvent(streamersDev0.back()->event(), 0, sideStreams[0], false), true); + } + + passed(); } From ae983e1b094781675dd350ffac595823b2ead5c5 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 23 May 2017 23:47:56 -0500 Subject: [PATCH 101/171] Remove HIP_MAX_QUEUES (replaced with HCC_MAX_QUEUES) [ROCm/clr commit: d43d57d39c06c5a3777225b50808f8f4c264f1af] --- projects/clr/hipamd/src/hip_hcc.cpp | 91 +--------------------- projects/clr/hipamd/src/hip_hcc_internal.h | 12 --- projects/clr/hipamd/src/hip_memory.cpp | 3 - projects/clr/hipamd/src/hip_stream.cpp | 6 +- 4 files changed, 4 insertions(+), 108 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index efa05cbb93..e77c4186e8 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -64,7 +64,6 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS; std::vector g_hipLaunchBlockingKernels; int HIP_API_BLOCKING = 0; -int HIP_MAX_QUEUES = 0; int 
HIP_PRINT_ENV = 0; int HIP_TRACE_API= 0; @@ -267,31 +266,6 @@ ihipStream_t::~ihipStream_t() } -inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit) -{ - if (HIP_MAX_QUEUES && !streamCrit->_hasQueue) { - - // To avoid deadlock, we have to release the stream lock before acquiring context lock. - // Else we can get hung if another thread has the context lock is trying to get lock for this stream. - // We lock it again below. - streamCrit->munlock(); - - // Obtain mutex access to the device critical data, release by destructor - LockedAccessor_CtxCrit_t ctxCrit(this->_ctx->criticalData()); - // TODO - auto needyCritPtr = this->_criticalData.mlock(); - - // Second test to ensure we still need to steal the queue - another thread may have - // snuck in here and already solved the issue. - if (!needyCritPtr->_hasQueue) { - needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this); - } - - streamCrit->_hasQueue = true; - } - assert(streamCrit->_hasQueue); -} - hc::hcWaitMode ihipStream_t::waitMode() const { hc::hcWaitMode waitMode = hc::hcWaitModeActive; @@ -323,13 +297,9 @@ hc::hcWaitMode ihipStream_t::waitMode() const //This signature should be used in routines that already have locked the stream mutex void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit) { - if (crit->_hasQueue) { - tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); + tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str()); - crit->_av.wait(waitMode()); - } else { - tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str()); - } + crit->_av.wait(waitMode()); crit->_kernelCnt = 0; } @@ -350,7 +320,6 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event) { LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.create_blocking_marker(event->_marker, hc::accelerator_scope); } @@ -362,7 +331,6 @@ void 
ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); #if USE_NO_SCOPE printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); @@ -406,7 +374,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() crit->_kernelCnt = 0; } - this->ensureHaveQueue(crit); @@ -1001,55 +968,6 @@ std::string ihipCtx_t::toString() const }; -hc::accelerator_view -ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream) -{ - - // TODO - review handling if queue can't be found. - while (1) { - - for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) { - if (*iter != needyStream) { - auto victimCritPtr = (*iter)->_criticalData.mtry_lock(); - if (victimCritPtr) { - // try-lock succeeded: - if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) { - - victimCritPtr->_hasQueue = false; - - tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n", - ToString(*iter).c_str(), ToString(needyStream).c_str()); - - hc::accelerator_view av = victimCritPtr->_av; - - // TODO - cleanup to remove forced setting to N - uint64_t *p = (uint64_t*)(&victimCritPtr->_av); - *p = 0; // damage the victim av so attempt to use it will fault. 
- (*iter)->_criticalData.munlock(); - return av; - } - (*iter)->_criticalData.munlock(); - } - } - } - } -} - - -hc::accelerator_view -ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit) -{ - if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) { - // Steal a queue from an existing stream: - hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr); - return av; - } else { - // Create a new view - return getWriteableDevice()->_acc.create_view(); - } -} - //---- @@ -1279,7 +1197,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); - READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device. Additional streams will share the specified number of queues. 0=no limit."); READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2078,7 +1994,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform fast asynchronous copy - we know copyDevice != NULL based on check above try { - this->ensureHaveQueue(crit); if (HIP_FORCE_SYNC_COPY) { crit->_av.copy_ext (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc, forceUnpinnedCopy); @@ -2115,7 +2030,6 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes // Perform slow synchronous copy: LockedAccessor_StreamCrit_t crit(_criticalData); - this->ensureHaveQueue(crit); crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? 
&copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy); } @@ -2170,7 +2084,6 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc) //--- -// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned... hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av) { HIP_INIT_API(stream, av); diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 0d080f9225..278f52dc51 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -447,7 +447,6 @@ public: ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) : _kernelCnt(0), _av(av), - _hasQueue(true), _parent(parentStream) { }; @@ -473,11 +472,6 @@ public: uint32_t _kernelCnt; // Count of inflight kernels in this stream. Reset at ::wait(). hc::accelerator_view _av; - - // True if the stream has an allocated queue (accelerato_view) for its use: - // Always true at ihipStream creation but queue may later be stolen. - // This acts as a valid bit for the _av. 
- bool _hasQueue; private: }; @@ -544,8 +538,6 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; - void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit); - public: //--- //Public member vars - these are set at initialization and never change: @@ -792,10 +784,6 @@ public: // Functions: void locked_waitAllStreams(); void locked_syncDefaultStream(bool waitOnSelf, bool syncHost); - // Will allocate a queue and assign it to the needyStream: - hc::accelerator_view stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream); - hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit); - ihipCtxCritical_t &criticalData() { return _criticalData; }; const ihipDevice_t *getDevice() const { return _device; }; diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 3f95cd22b4..a3d761752e 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -948,7 +948,6 @@ hipError_t hipMemsetAsync(void* dst, int value, size_t sizeBytes, hipStream_t s if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; @@ -1000,7 +999,6 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes ) if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { @@ -1053,7 +1051,6 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeByte if (stream) { auto crit = stream->lockopen_preKernelCommand(); - stream->ensureHaveQueue(crit); hc::completion_future cf ; if ((sizeBytes & 0x3) == 0) { diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index 34b4bc8851..b4a0740b96 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -49,7 +49,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, 
unsigned int flags) // Obtain mutex access to the device critical data, release by destructor LockedAccessor_CtxCrit_t ctxCrit(ctx->criticalData()); - auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags); + auto istream = new ihipStream_t(ctx, acc.create_view(), flags); ctxCrit->addStream(istream); *stream = istream; @@ -129,9 +129,7 @@ hipError_t hipStreamQuery(hipStream_t stream) { LockedAccessor_StreamCrit_t crit(stream->_criticalData); - if (crit->_hasQueue) { - pendingOps = crit->_av.get_pending_async_ops(); - } + pendingOps = crit->_av.get_pending_async_ops(); } From d3024987877aea4447bffe1d72e0b4ffff19593d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 24 May 2017 00:48:10 -0500 Subject: [PATCH 102/171] Add hipHostMallocCoherent, hipHostMallocNonCoherent Provide per-allocation control over coherent/non-coherent mem. These override the default HIP_COHERENT_HOST_ALLOC setting. [ROCm/clr commit: dda70ae5141c30e0e17dc9bca60feb7f258c17e3] --- .../include/hip/hcc_detail/hip_runtime_api.h | 10 ++- .../include/hip/nvcc_detail/hip_runtime_api.h | 2 + projects/clr/hipamd/src/hip_hcc.cpp | 2 +- projects/clr/hipamd/src/hip_memory.cpp | 31 +++++-- .../src/runtimeApi/memory/hipHostMalloc.cpp | 83 +++++++++++++++---- projects/clr/hipamd/util/vim/hip.vim | 2 + 6 files changed, 104 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 34ed2ed5ce..6fb7c0256e 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -111,17 +111,21 @@ enum hipLimit_t //! Flags that can be used with hipHostMalloc #define hipHostMallocDefault 0x0 -#define hipHostMallocPortable 0x1 -#define hipHostMallocMapped 0x2 +#define hipHostMallocPortable 0x1 ///< Memory is considered allocated by all contexts.
+#define hipHostMallocMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostMallocWriteCombined 0x4 +#define hipHostMallocCoherent 0x40000000 ///< Allocate coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. +#define hipHostMallocNonCoherent 0x80000000 ///< Allocate non-coherent memory. Overrides HIP_COHERENT_HOST_ALLOC for specific allocation. + //! Flags that can be used with hipHostRegister #define hipHostRegisterDefault 0x0 ///< Memory is Mapped and Portable -#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. HIP only supports one context so this is always assumed true. +#define hipHostRegisterPortable 0x1 ///< Memory is considered registered by all contexts. #define hipHostRegisterMapped 0x2 ///< Map the allocation into the address space for the current device. The device pointer can be obtained with #hipHostGetDevicePointer. #define hipHostRegisterIoMemory 0x4 ///< Not supported. + #define hipDeviceScheduleAuto 0x0 ///< Automatically select between Spin and Yield #define hipDeviceScheduleSpin 0x1 ///< Dedicate a CPU core to spin-wait. Provides lowest latency, but burns a CPU core and may consume more power. #define hipDeviceScheduleYield 0x2 ///< Yield the CPU to the operating system when waiting. May increase latency, but lowers power and is friendlier to other threads in the system. 
diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index 01a93f7ba4..cbc7ed9f9c 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -65,6 +65,8 @@ hipMemcpyHostToHost #define hipHostMallocPortable cudaHostAllocPortable #define hipHostMallocMapped cudaHostAllocMapped #define hipHostMallocWriteCombined cudaHostAllocWriteCombined +#define hipHostMallocCoherent 0x0 +#define hipHostMallocNonCoherent 0x0 #define hipHostRegisterPortable cudaHostRegisterPortable #define hipHostRegisterMapped cudaHostRegisterMapped diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index e77c4186e8..4588f67c2d 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -74,7 +74,7 @@ int HIP_PROFILE_API= 0; std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; -int HIP_VISIBLE_DEVICES = 0; /* Contains a comma-separated sequence of GPU identifiers */ +int HIP_VISIBLE_DEVICES = 0; int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index a3d761752e..3ab7713afa 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -267,17 +267,36 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) trueFlags = hipHostMallocMapped | hipHostMallocPortable; } - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | hipHostMallocWriteCombined; - if (flags & ~supportedFlags) { + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent + | hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & 
~supportedFlags) || + ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent hip_status = hipErrorInvalidValue; - } - else { + } else { auto device = ctx->getWriteableDevice(); - unsigned amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + + unsigned amFlags = 0; + if (flags & hipHostMallocCoherent) { + amFlags = amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags = amHostPinned; + } else { + // depends on env variables: + amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + } - *ptr = hip_internal::allocAndSharePtr(HIP_COHERENT_HOST_ALLOC ? "finegrained_host":"pinned_host", + *ptr = hip_internal::allocAndSharePtr((amFlags & amHostCoherent) ? "finegrained_host":"pinned_host", sizeBytes, ctx, (trueFlags & hipHostMallocPortable) /*shareWithAll*/, amFlags, flags); if(sizeBytes && (*ptr == NULL)){ diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index d6b3b05a1d..31596b5ea5 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -31,14 +31,19 @@ #define LEN 1024*1024 #define SIZE LEN*sizeof(float) -__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){ +__global__ void Add(float *Ad, float *Bd, float *Cd){ int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; Cd[tx] = Ad[tx] + Bd[tx]; } + +__global__ void Set(int *Ad, int val){ + int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + Ad[tx] = val; +} + int main(){ - float *A, *B, *C; - float *Ad, *Bd, *Cd; + hipDeviceProp_t prop; int device; @@ -49,26 +54,72 @@ int main(){ failed("Does support HostPinned Memory"); } - HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped)); - HIPCHECK(hipHostMalloc((void**)&B, SIZE, 
hipHostMallocDefault)); - HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - HIPCHECK(hipHostGetDevicePointer((void**)&Ad, A, 0)); - HIPCHECK(hipHostGetDevicePointer((void**)&Cd, C, 0)); + { + float *A, *B, *C; + float *Ad, *Bd, *Cd; + HIPCHECK(hipHostMalloc((void**)&A, SIZE, hipHostMallocWriteCombined | hipHostMallocMapped)); + HIPCHECK(hipHostMalloc((void**)&B, SIZE, hipHostMallocDefault)); + HIPCHECK(hipHostMalloc((void**)&C, SIZE, hipHostMallocMapped)); - for(int i=0;i Date: Wed, 24 May 2017 01:03:28 -0500 Subject: [PATCH 103/171] Remove HIP_NUM_KERNELS_INFLIGHT. (redundant with HCC controls) [ROCm/clr commit: 35212632e7e82c44724e12a01673c7e407f3b196] --- projects/clr/hipamd/docs/markdown/hip_porting_guide.md | 1 - projects/clr/hipamd/src/hip_hcc.cpp | 10 ---------- 2 files changed, 11 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md index 72f6384f6d..84887fd512 100644 --- a/projects/clr/hipamd/docs/markdown/hip_porting_guide.md +++ b/projects/clr/hipamd/docs/markdown/hip_porting_guide.md @@ -569,7 +569,6 @@ HIP_TRACE_API = 0 : Trace each HIP API call. Print function n HIP_TRACE_API_COLOR = green : Color to use for HIP_API. 
None/Red/Green/Yellow/Blue/Magenta/Cyan/White HIP_PROFILE_API = 0 : Add HIP function begin/end to ATP file generated with CodeXL HIP_VISIBLE_DEVICES = 0 : Only devices whose index is present in the secquence are visible to HIP applications and they are enumerated in the order of secquence -HIP_NUM_KERNELS_INFLIGHT = 128 : Number of kernels per stream ``` diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 4588f67c2d..8e4a20ad74 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -75,7 +75,6 @@ std::string HIP_DB_START_API; std::string HIP_DB_STOP_API; int HIP_DB= 0; int HIP_VISIBLE_DEVICES = 0; -int HIP_NUM_KERNELS_INFLIGHT = 128; int HIP_WAIT_MODE = 0; int HIP_FORCE_P2P_HOST = 0; @@ -369,13 +368,6 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand() LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/); - if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){ - this->wait(crit); - crit->_kernelCnt = 0; - } - - - return crit; } @@ -1225,8 +1217,6 @@ void HipReadEnv() READ_ENV_I(release, HIP_SYNC_HOST_ALLOC, 0, "Sync before and after all host memory allocations. May help stability"); READ_ENV_I(release, HIP_SYNC_NULL_STREAM, 0, "Synchronize on host for null stream submissions"); - // TODO - review, can we remove this? - READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced."); READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); From ed54e3d0ee623b2a87194121ee8e259e6daff6ff Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 24 May 2017 18:25:40 +0300 Subject: [PATCH 104/171] [FIX] [HIPIFY] Add matchers for function return types. 
https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/issues/73 Examples (https://github.com/thrust/thrust/blob/master/thrust/system/cuda/detail/trivial_copy.inl): template cudaStream_t cuda_memcpy_stream(const thrust::cpp::execution_policy &, const thrust::cuda::execution_policy &exec) template cudaMemcpyKind cuda_memcpy_kind(const thrust::cuda::execution_policy &, const thrust::cpp::execution_policy &) [ROCm/clr commit: a19ecab3f2c8041deef4267bcf25dd084162b670] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index dcb9c3d216..f17c3e2646 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -3123,6 +3123,33 @@ private: return false; } + bool cudaFunctionReturn(const MatchFinder::MatchResult &Result) { + if (const auto *ret = Result.Nodes.getNodeAs("cudaFunctionReturn")) { + QualType QT = ret->getReturnType(); + SourceManager *SM = Result.SourceManager; + SourceRange sr = ret->getReturnTypeSourceRange(); + SourceLocation sl = sr.getBegin(); + std::string name = QT.getAsString(); + if (QT.getTypePtr()->isEnumeralType()) { + name = QT.getTypePtr()->getAs()->getDecl()->getNameAsString(); + } + const auto found = N.cuda2hipRename.find(name); + if (found != N.cuda2hipRename.end()) { + updateCounters(found->second, name); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + Replacement Rep(*SM, sl, name.size(), repName); + FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } + else { + std::string msg = "the following reference is not handled: '" + name + "' [function return]."; + printHipifyMessage(*SM, sl, msg); + } + } + return false; + } bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { StringRef refName = "cudaSharedIncompleteArrayVar"; @@ -3269,6 
+3296,7 @@ public: if (cudaParamDeclPtr(Result)) break; if (cudaLaunchKernel(Result)) break; if (cudaNewOperatorDecl(Result)) break; + if (cudaFunctionReturn(Result)) break; if (cudaSharedIncompleteArrayVar(Result)) break; if (stringLiteral(Result)) break; if (unresolvedTemplateName(Result)) break; @@ -3373,6 +3401,16 @@ void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callbac hasType(pointsTo(namedDecl(matchesName("cu.*|CU.*"))))) .bind("cudaNewOperatorDecl"), Callback); + // Examples: + // 1. + // cudaStream_t cuda_memcpy_stream(...) + // 2. + // template cudaMemcpyKind cuda_memcpy_kind(...) + Finder.addMatcher(functionDecl(isExpansionInMainFile(), + returns(hasDeclaration(namedDecl(matchesName("cu.*|CU.*"))))) + .bind("cudaFunctionReturn"), + Callback); + } int64_t printStats(const std::string &csvFile, const std::string &srcFile, From 6c3a05ac5bdabfde927b3fe1cd9c5bbea4063e92 Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Thu, 25 May 2017 23:15:30 -0400 Subject: [PATCH 105/171] fix hip_fast_dsqrt* to call a double fp sqrt function [ROCm/clr commit: a3595d2e8c1e5fe6ba7434a3ae9cb26ab172fbdd] --- projects/clr/hipamd/src/device_util.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/device_util.cpp b/projects/clr/hipamd/src/device_util.cpp index b730412874..bea42aba46 100644 --- a/projects/clr/hipamd/src/device_util.cpp +++ b/projects/clr/hipamd/src/device_util.cpp @@ -1215,20 +1215,23 @@ __device__ float __hip_fast_tanf(float x) { } // Double Precision Math +// FIXME - HCC doesn't have a fast_math version double FP sqrt +// Another issue is that these intrinsics call for a specific rounding mode; +// however, their implementation all map to the same sqrt builtin __device__ double __hip_fast_dsqrt_rd(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rn(double x) { - return hc::fast_math::sqrt(x); + return 
hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_ru(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ double __hip_fast_dsqrt_rz(double x) { - return hc::fast_math::sqrt(x); + return hc::precise_math::sqrt(x); } __device__ void __threadfence_system(void){ From d9587ae2f01ba7c196a125ae4e6827b098c076a6 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Thu, 25 May 2017 10:37:03 -0500 Subject: [PATCH 106/171] Add isDefaultStream() accessor. Fix code that checked for stream==nullptr after stream had been resolved to a "true stream". [ROCm/clr commit: b2b620c12b2415bd6856db659b63ac16d2d705c2] --- projects/clr/hipamd/src/hip_event.cpp | 19 +++++++++++++++---- projects/clr/hipamd/src/hip_hcc_internal.h | 5 ++++- projects/clr/hipamd/src/hip_stream.cpp | 4 ++-- 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index fbaf5cc463..c11a47b341 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -53,14 +53,19 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihip void ihipEvent_t::setTimestamp() { + bool isReady0 = _marker.is_ready(); + bool isReady1; + int val = 0; if (_state == hipEventStatusRecorded) { // already recorded, done: return; } else { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); + isReady1 = _marker.is_ready(); if (sig) { - if (hsa_signal_load_acquire(*sig) == 0) { + val = hsa_signal_load_acquire(*sig); + if (val == 0) { if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { _timestamp = _marker.get_end_tick(); @@ -75,6 +80,10 @@ void ihipEvent_t::setTimestamp() } } } + + if (_state != hipEventStatusRecorded) { + printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + } } @@ -118,11 +127,11 @@ hipError_t 
hipEventRecord(hipEvent_t event, hipStream_t stream) event->_stream = stream; - if (HIP_SYNC_NULL_STREAM && stream == NULL) { + if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) { // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 - // If stream == NULL, wait on all queues. + // If default stream , then wait on all queues. ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true, true); @@ -167,7 +176,7 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else if (event->_state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (HIP_SYNC_NULL_STREAM && (event->_stream == NULL)) { + } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) { auto *ctx = ihipGetTlsDefaultCtx(); // TODO-HIP_SYNC_NULL_STREAM - can remove this code ctx->locked_syncDefaultStream(true, true); @@ -175,6 +184,8 @@ hipError_t hipEventSynchronize(hipEvent_t event) } else { event->_marker.wait((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); + assert (event->_marker.is_ready()); + return ihipLogStatus(hipSuccess); } } else { diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 278f52dc51..94ad4f9340 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -538,10 +538,12 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + bool isDefaultStream() const { return _id == 0; }; + public: //--- //Public member vars - these are set at initialization and never change: - SeqNum_t _id; // monotonic sequence ID + SeqNum_t _id; // monotonic sequence ID. 0 is the default stream. 
unsigned _flags; @@ -560,6 +562,7 @@ private: void addSymbolPtrToTracker(hc::accelerator& acc, void* ptr, size_t sizeBytes); + public: // TODO - move private // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t ihipStreamCritical_t _criticalData; diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index b4a0740b96..9f1228d6f7 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -146,7 +146,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) hipError_t e = hipSuccess; - if (stream == NULL) { + if (stream == hipStreamNull) { ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { @@ -198,7 +198,7 @@ hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) if (flags == NULL) { return ihipLogStatus(hipErrorInvalidValue); - } else if (stream == NULL) { + } else if (stream == hipStreamNull) { return ihipLogStatus(hipErrorInvalidResourceHandle); } else { *flags = stream->_flags; From 9442c6dd2dfc16d088bbb501a330d1b08c96080c Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 27 May 2017 15:55:07 -0500 Subject: [PATCH 107/171] Updates so hip compiles on CUDA. 
[ROCm/clr commit: 8dc968f036db674f8908e4364e4a6fbb25705492] --- .../include/hip/hcc_detail/hip_runtime_api.h | 25 +++++++++++++++++++ .../clr/hipamd/include/hip/hip_runtime_api.h | 23 ----------------- .../src/runtimeApi/stream/hipNullStream.cpp | 2 +- .../runtimeApi/stream/hipStreamWaitEvent.cpp | 2 +- projects/clr/hipamd/tests/src/test_common.h | 2 +- 5 files changed, 28 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 6fb7c0256e..a8db84c4f2 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -136,6 +136,31 @@ enum hipLimit_t #define hipDeviceLmemResizeToMax 0x16 +/* +* @brief hipJitOption +* @enum +* @ingroup Enumerations +*/ +typedef enum hipJitOption { + hipJitOptionMaxRegisters = 0, + hipJitOptionThreadsPerBlock, + hipJitOptionWallTime, + hipJitOptionInfoLogBuffer, + hipJitOptionInfoLogBufferSizeBytes, + hipJitOptionErrorLogBuffer, + hipJitOptionErrorLogBufferSizeBytes, + hipJitOptionOptimizationLevel, + hipJitOptionTargetFromContext, + hipJitOptionTarget, + hipJitOptionFallbackStrategy, + hipJitOptionGenerateDebugInfo, + hipJitOptionLogVerbose, + hipJitOptionGenerateLineInfo, + hipJitOptionCacheMode, + hipJitOptionNumOptions +} hipJitOption; + + /** * @warning On AMD devices and recent Nvidia devices, these hints and controls are ignored. */ diff --git a/projects/clr/hipamd/include/hip/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hip_runtime_api.h index fa54dda5dc..dc163d5c25 100644 --- a/projects/clr/hipamd/include/hip/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hip_runtime_api.h @@ -250,29 +250,6 @@ typedef enum hipDeviceAttribute_t { hipDeviceAttributeIsMultiGpuBoard, ///< Multiple GPU devices. 
} hipDeviceAttribute_t; -/* -* @brief hipJitOption -* @enum -* @ingroup Enumerations -*/ -typedef enum hipJitOption { - hipJitOptionMaxRegisters = 0, - hipJitOptionThreadsPerBlock, - hipJitOptionWallTime, - hipJitOptionInfoLogBuffer, - hipJitOptionInfoLogBufferSizeBytes, - hipJitOptionErrorLogBuffer, - hipJitOptionErrorLogBufferSizeBytes, - hipJitOptionOptimizationLevel, - hipJitOptionTargetFromContext, - hipJitOptionTarget, - hipJitOptionFallbackStrategy, - hipJitOptionGenerateDebugInfo, - hipJitOptionLogVerbose, - hipJitOptionGenerateLineInfo, - hipJitOptionCacheMode, - hipJitOptionNumOptions -} hipJitOption; /** * @} diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp index 380979f6bc..b610315608 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipNullStream.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index d12b07289b..9bbd43828c 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -18,7 +18,7 @@ THE SOFTWARE. */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 * RUN: %t * HIT_END */ diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index bb44c94745..81edca4e1e 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -249,7 +249,7 @@ void initArraysForHost(T **A_h, T **B_h, T **C_h, } } - setDefaultData(N, A_h ? 
*A_h : nullptr, B_h ? *B_h : nullptr, C_h ? *C_h : nullptr); + setDefaultData(N, A_h ? *A_h : NULL, B_h ? *B_h : NULL, C_h ? *C_h : NULL); } From d6e8f5bbdc2ac039b650215d1a1f50ae83226b8b Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 26 May 2017 14:48:27 -0500 Subject: [PATCH 108/171] Cleanup hipEvent. (Intermediate checkpoint) Support hipEventDisableSystemRelease flag. Update test. Remove stray printf [ROCm/clr commit: c8178c6838c70f71364f321309d36af9238e3047] --- projects/clr/hipamd/src/hip_event.cpp | 28 ++--- projects/clr/hipamd/src/hip_hcc.cpp | 6 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 102 ++++++++++++++---- 3 files changed, 100 insertions(+), 36 deletions(-) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index c11a47b341..71f6d8ed5b 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -82,7 +82,7 @@ void ihipEvent_t::setTimestamp() } if (_state != hipEventStatusRecorded) { - printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); + //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -92,7 +92,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) hipError_t e = hipSuccess; // TODO-IPC - support hipEventInterprocess. 
- unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming; + unsigned supportedFlags = hipEventDefault + | hipEventBlockingSync + | hipEventDisableTiming + | hipEventDisableSystemRelease; if ((flags & ~supportedFlags) == 0) { ihipEvent_t *eh = new ihipEvent_t(flags); @@ -197,20 +200,18 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - ihipEvent_t *start_eh = start; - ihipEvent_t *stop_eh = stop; - start->setTimestamp(); stop->setTimestamp(); hipError_t status = hipSuccess; *ms = 0.0f; - if (start_eh && stop_eh) { - if ((start_eh->_state == hipEventStatusRecorded) && (stop_eh->_state == hipEventStatusRecorded)) { + if (start && stop) { + // refresh status: + if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { // Common case, we have good information for both events. - int64_t tickDiff = (stop_eh->timestamp() - start_eh->timestamp()); + int64_t tickDiff = (stop->timestamp() - start->timestamp()); uint64_t freqHz; hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); @@ -223,13 +224,16 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) } - } else if ((start_eh->_state == hipEventStatusRecording) || - (stop_eh->_state == hipEventStatusRecording)) { + } else if ((start->_state == hipEventStatusRecording) || + (stop->_state == hipEventStatusRecording)) { + status = hipErrorNotReady; - } else if ((start_eh->_state == hipEventStatusUnitialized) || - (stop_eh->_state == hipEventStatusUnitialized)) { + } else if ((start->_state == hipEventStatusUnitialized) || + (stop->_state == hipEventStatusUnitialized)) { status = hipErrorInvalidResourceHandle; } + } else { + status = hipErrorInvalidResourceHandle; } return ihipLogStatus(status); diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 8e4a20ad74..5e13904521 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp 
+++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -49,7 +49,7 @@ THE SOFTWARE. // needs HCC change for hc::no_scope -#define USE_NO_SCOPE 0 +#define USE_NO_SCOPE 1 //================================================================================================= //Global variables: @@ -331,10 +331,10 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) LockedAccessor_StreamCrit_t crit(_criticalData); #if USE_NO_SCOPE - printf ("create_marker, flags = %x\n", event->_flags); + //printf ("create_marker, flags = %x\n", event->_flags); event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); #else - event->_marker = crit->_av.create_marker(); + event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::accelerator_scope : hc::system_scope); #endif }; diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 31596b5ea5..0e88570e17 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -42,6 +42,63 @@ __global__ void Set(int *Ad, int val){ Ad[tx] = val; } + +#define SYNC_EVENT 0 +#define SYNC_STREAM 1 +#define SYNC_DEVICE 2 + +std::vector syncMsg = {"event", "stream", "device"}; + +void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +{ + std::cerr << "test: CheckHostPointer " << msg + << " ptr=" << ptr + << " syncMethod=" << syncMsg[syncMethod] << "\n"; + + hipStream_t s; + hipEvent_t e; + + // Init: + HIPCHECK(hipStreamCreate(&s)); + HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + dim3 dimBlock(64,1,1); + dim3 dimGrid(numElements/dimBlock.x,1,1); + + const int expected = 13; + + // Init array to know state: + hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, 0x0, ptr, -42); + HIPCHECK(hipDeviceSynchronize()); + + 
hipLaunchKernelGGL(Set, dimGrid, dimBlock, 0, s, ptr, expected); + HIPCHECK(hipEventRecord(e, s)); + + // Host waits for event : + switch (syncMethod) { + case SYNC_EVENT: + HIPCHECK(hipEventSynchronize(e)); + break; + case SYNC_STREAM: + HIPCHECK(hipStreamSynchronize(s)); + break; + case SYNC_DEVICE: + HIPCHECK(hipDeviceSynchronize()); + break; + default: + assert(0); + }; + + for (int i=0; i Date: Sat, 27 May 2017 16:01:23 -0500 Subject: [PATCH 109/171] Add event controls for release fences. Env var : HIP_EVENT_SYS_RELEASE Event allocation flags : hipEventReleaseToDevice, hipEventReleaseToSystem (remove hipEventDisableSystemRelease) Update test for new functionality. [ROCm/clr commit: 942ec0eff8cded9995ee8335a27534243e681606] --- .../include/hip/hcc_detail/hip_runtime_api.h | 3 ++- .../include/hip/nvcc_detail/hip_runtime_api.h | 3 ++- projects/clr/hipamd/src/hip_event.cpp | 11 ++++++-- projects/clr/hipamd/src/hip_hcc.cpp | 24 ++++++++++++----- .../src/runtimeApi/memory/hipHostMalloc.cpp | 27 ++++++++++++------- 5 files changed, 48 insertions(+), 20 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index a8db84c4f2..6059e1e92d 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -106,7 +106,8 @@ enum hipLimit_t #define hipEventBlockingSync 0x1 ///< Waiting will yield CPU. Power-friendly and usage-friendly but may increase latency. #define hipEventDisableTiming 0x2 ///< Disable event's capability to record timing information. May improve performance. #define hipEventInterprocess 0x4 ///< Event can support IPC. @warning - not supported in HIP. -#define hipEventDisableSystemRelease 0x80000000 /// < Disable the system-scope release that event normally performs when it records. This flag is useful to obtain more precise timings of commands between events. 
The flag is a no-op on CUDA platforms. +#define hipEventReleaseToDevice 0x40000000 ///< Use a device-scope release when recording this event. This flag is useful to obtain more precise timings of commands between events. The flag is a no-op on CUDA platforms. +#define hipEventReleaseToSystem 0x80000000 ///< Use a system-scope release when recording this event. This flag is useful to make non-coherent host memory visible to the host. The flag is a no-op on CUDA platforms. //! Flags that can be used with hipHostMalloc diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index cbc7ed9f9c..b09c9323c7 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -58,7 +58,8 @@ hipMemcpyHostToHost #define hipEventBlockingSync cudaEventBlockingSync #define hipEventDisableTiming cudaEventDisableTiming #define hipEventInterprocess cudaEventInterprocess -#define hipEventDisableSystemRelease cudaEventDefault /* no-op on CUDA platform */ +#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */ +#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */ #define hipHostMallocDefault cudaHostAllocDefault diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 71f6d8ed5b..2c31769718 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -95,8 +95,15 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) unsigned supportedFlags = hipEventDefault | hipEventBlockingSync | hipEventDisableTiming - | hipEventDisableSystemRelease; - if ((flags & ~supportedFlags) == 0) { + | hipEventReleaseToDevice + | hipEventReleaseToSystem + ; + const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); + + const bool illegalFlags = (flags & ~supportedFlags) || // can't set any unsupported flags.
+ (flags & releaseFlags) == releaseFlags; // can't set both + + if (!illegalFlags) { ihipEvent_t *eh = new ihipEvent_t(flags); *event = eh; diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 5e13904521..4400e4596e 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -93,8 +93,11 @@ int HIP_SYNC_HOST_ALLOC = 1; // Sync on host between int HIP_SYNC_NULL_STREAM = 1; +// HIP needs to change some behavior based on HCC_OPT_FLUSH : int HCC_OPT_FLUSH = 0; +int HIP_EVENT_SYS_RELEASE=0; + @@ -330,12 +333,18 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); -#if USE_NO_SCOPE - //printf ("create_marker, flags = %x\n", event->_flags); - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::no_scope : hc::system_scope); -#else - event->_marker = crit->_av.create_marker((event->_flags & hipEventDisableSystemRelease) ? hc::accelerator_scope : hc::system_scope); -#endif + auto scopeFlag = hc::accelerator_scope; + // The env var HIP_EVENT_SYS_RELEASE sets the default, + // The explicit flags override the env var (if specified) + if (event->_flags & hipEventReleaseToSystem) { + scopeFlag = hc::system_scope; + } else if (event->_flags & hipEventReleaseToDevice) { + scopeFlag = hc::accelerator_scope; + } else { + scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope; + } + + event->_marker = crit->_av.create_marker(scopeFlag); }; //============================================================================= @@ -1221,7 +1230,8 @@ void HipReadEnv() READ_ENV_I(release, HIP_COHERENT_HOST_ALLOC, 0, "If set, all host memory will be allocated as fine-grained system memory. 
This allows threadfence_system to work but prevents host memory from being cached on GPU which may have performance impact."); - READ_ENV_I(release, HCC_OPT_FLUSH, 0, "Note this flag also impacts HCC. When set, use agent-scope flush rather than system-scope flush when possible."); + READ_ENV_I(release, HCC_OPT_FLUSH, 0, "When set, use agent-scope fence operations rather than system-scope fence operationsflush when possible. This flag controls both HIP and HCC behavior."); + READ_ENV_I(release, HIP_EVENT_SYS_RELEASE, 0, "If set, event are created with hipEventReleaseToSystem by default. If 0, events are created with hipEventReleaseToDevice by default. The defaults can be overridden by specifying hipEventReleaseToSystem or hipEventReleaseToDevice flag when creating the event."); // Some flags have both compile-time and runtime flags - generate a warning if user enables the runtime flag but the compile-time flag is disabled. if (HIP_DB && !COMPILE_HIP_DB) { diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 0e88570e17..54073e4901 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -49,9 +49,12 @@ __global__ void Set(int *Ad, int val){ std::vector syncMsg = {"event", "stream", "device"}; -void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg) +void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + << " eventFlags = " << std::hex << eventFlags + << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") + << ((eventFlags & hipEventReleaseToSystem) ? 
" hipEventReleaseToSystem" : "") << " ptr=" << ptr << " syncMethod=" << syncMsg[syncMethod] << "\n"; @@ -60,7 +63,7 @@ void CheckHostPointer(int numElements, int *ptr, int syncMethod, std::string msg // Init: HIPCHECK(hipStreamCreate(&s)); - HIPCHECK(hipEventCreateWithFlags(&e, hipEventDisableSystemRelease)); + HIPCHECK(hipEventCreateWithFlags(&e, eventFlags)) dim3 dimBlock(64,1,1); dim3 dimGrid(numElements/dimBlock.x,1,1); @@ -161,18 +164,24 @@ int main(){ int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocNonCoherent)); const char *ptrType = "non-coherent"; // TODO - //CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - //CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); + + // agent-scope releases don't provide host visibility, don't use them here: } - if (0) { // TODO, remove me + if (1) { int *A = nullptr; HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent)); const char *ptrType = "coherent"; - CheckHostPointer(numElements, A, SYNC_DEVICE, ptrType); - CheckHostPointer(numElements, A, SYNC_STREAM, ptrType); - CheckHostPointer(numElements, A, SYNC_EVENT, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToDevice, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, hipEventReleaseToSystem, SYNC_EVENT, ptrType); } From 06cdafe3115c7bba186d43e404afb1192e3de2bd Mon Sep 
17 00:00:00 2001 From: Maneesh Gupta Date: Tue, 30 May 2017 15:45:22 +0530 Subject: [PATCH 110/171] Disable normcdfinvf on __host__ Change-Id: If7bfc9826a09eb9b7675ea2a417b9418759b7912 [ROCm/clr commit: 445012d4510cf22733bc90945e75ec243a4c8e60] --- projects/clr/hipamd/src/math_functions.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/math_functions.cpp b/projects/clr/hipamd/src/math_functions.cpp index 151627fc73..f66f0a4312 100644 --- a/projects/clr/hipamd/src/math_functions.cpp +++ b/projects/clr/hipamd/src/math_functions.cpp @@ -937,10 +937,10 @@ __host__ void sincospi(double x, double *sptr, double *cptr) *cptr = std::cos(HIP_PI*x); } -__host__ float normcdfinvf(float x) -{ - return std::sqrt(2) * erfinvf(2*x-1); -} +//__host__ float normcdfinvf(float x) +//{ +// return std::sqrt(2) * erfinvf(2*x-1); +//} //__host__ double normcdfinv(double x) //{ From ffb0d43b07dda765a09ed59f84ffb61d6eac7b57 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 17:58:13 +0300 Subject: [PATCH 111/171] [HIPIFY] Add more CUDA Driver API 8.0.44 Data structures. 
[ROCm/clr commit: ef86f943ac78bb07f3eec0dc4fa8b26dcb333855] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index f17c3e2646..b163c4d20a 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -205,6 +205,7 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = {"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -695,6 +696,10 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_LOG_VERBOSE"] = {"hipJitOptionLogVerbose", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum cuda2hipRename["CUjit_target"] = {"hipJitTarget", 
CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) @@ -711,6 +716,11 @@ struct cuda2hipMap { cuda2hipRename["CU_TARGET_COMPUTE_37"] = {"hipJitTargetCompute37", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_50"] = {"hipJitTargetCompute50", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CU_TARGET_COMPUTE_52"] = {"hipJitTargetCompute52", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + cuda2hipRename["CU_TARGET_COMPUTE_53"] = {"hipJitTargetCompute53", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_60"] = {"hipJitTargetCompute60", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_61"] = {"hipJitTargetCompute61", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_TARGET_COMPUTE_62"] = {"hipJitTargetCompute62", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // enum CUjitInputType/CUjitInputType_enum cuda2hipRename["CUjitInputType"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (no) cuda2hipRename["CUjitInputType_enum"] = {"hipJitInputType", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; @@ -831,6 +841,26 @@ struct cuda2hipMap { cuda2hipRename["CU_STREAM_DEFAULT"] = {"hipStreamDefault", CONV_STREAM, API_DRIVER}; cuda2hipRename["CU_STREAM_NON_BLOCKING"] = {"hipStreamNonBlocking", CONV_STREAM, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // Flags for ::cuStreamWaitValue32 + cuda2hipRename["CUstreamWaitValue_flags"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWaitValue_flags_enum"] = {"hipStreamWaitValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WAIT_VALUE_GEQ"] = {"hipStreamWaitValueGeq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WAIT_VALUE_EQ"] = {"hipStreamWaitValueEq", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + 
cuda2hipRename["CU_STREAM_WAIT_VALUE_AND"] = {"hipStreamWaitValueAnd", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x2 + cuda2hipRename["CU_STREAM_WAIT_VALUE_FLUSH"] = {"hipStreamWaitValueFlush", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1<<30 + // Flags for ::cuStreamWriteValue32 + cuda2hipRename["CUstreamWriteValue_flags"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamWriteValue_flags"] = {"hipStreamWriteValueFlags", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_WRITE_VALUE_DEFAULT"] = {"hipStreamWriteValueDefault", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x0 + cuda2hipRename["CU_STREAM_WRITE_VALUE_NO_MEMORY_BARRIER"] = {"hipStreamWriteValueNoMemoryBarrier", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 0x1 + // Flags for ::cuStreamBatchMemOp + cuda2hipRename["CUstreamBatchMemOpType"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUstreamBatchMemOpType_enum"] = {"hipStreamBatchMemOpType", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_STREAM_MEM_OP_WAIT_VALUE_32"] = {"hipStreamBatchMemOpWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"] = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"] = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Init cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; From 306dca2c78b40cafacccb10c38fb07cf58e9a318 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 18:29:14 +0300 Subject: [PATCH 112/171] [HIPIFY] Add the rest CUDA Driver API 8.0.44 Data structures. + Memory advise values + Memory Range Attributes + P2P Attributes P.S. There is no any new changes in CUDA Driver API 8.0.61 Data structures since 8.0.44. 
[ROCm/clr commit: a020eb76dd0497c3e549c125c73e2cdb48f228aa] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index b163c4d20a..512144f3b9 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -431,17 +431,36 @@ struct cuda2hipMap { cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE"] = {"hipComputeModeExclusive", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaComputeModeExclusive = 1) cuda2hipRename["CU_COMPUTEMODE_PROHIBITED"] = {"hipComputeModeProhibited", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaComputeModeProhibited = 2) cuda2hipRename["CU_COMPUTEMODE_EXCLUSIVE_PROCESS"] = {"hipComputeModeExclusiveProcess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaComputeModeExclusiveProcess = 3) + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 + 
cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + // CUmem_range_attribute + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + // Context flags - cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, 
HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) @@ -908,6 +927,14 @@ struct cuda2hipMap { cuda2hipRename["cuDeviceComputeCapability"] = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] 
= {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; From 54b3c90964d1ac57f45ae4e8f290312bfa4a1c2a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 30 May 2017 19:45:59 +0300 Subject: [PATCH 113/171] [HIPIFY] Add the rest CUDA Runtime API 8.0.44 Data structures. + sync with corresponding CUDA Driver API Data structures. P.S. There is no any new changes in CUDA Runtime API 8.0.61 Data structures since 8.0.44. [ROCm/clr commit: 997ed19bb87d22eb3d27f5916522edd9c00f1860] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 269 ++++++++++-------- 1 file changed, 147 insertions(+), 122 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 512144f3b9..59d05e69f7 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -205,7 +205,6 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_ARRAY"] = {"hipErrorNotMappedAsArray", CONV_ERR, API_DRIVER}; // 212 cuda2hipRename["CUDA_ERROR_NOT_MAPPED_AS_POINTER"] = {"hipErrorNotMappedAsPointer", CONV_ERR, API_DRIVER}; // 213 cuda2hipRename["CUDA_ERROR_CONTEXT_ALREADY_IN_USE"] = {"hipErrorContextAlreadyInUse", CONV_ERR, API_DRIVER}; // 216 - cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER}; // 220 [CUDA 8.0.44] cuda2hipRename["CUDA_ERROR_INVALID_SOURCE"] = {"hipErrorInvalidSource", CONV_ERR, API_DRIVER}; // 300 cuda2hipRename["CUDA_ERROR_FILE_NOT_FOUND"] = 
{"hipErrorFileNotFound", CONV_ERR, API_DRIVER}; // 301 cuda2hipRename["CUDA_ERROR_NOT_FOUND"] = {"hipErrorNotFound", CONV_ERR, API_DRIVER}; // 500 @@ -325,6 +324,9 @@ struct cuda2hipMap { cuda2hipRename["CUDA_ERROR_INVALID_GRAPHICS_CONTEXT"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_DRIVER}; // 219 cuda2hipRename["cudaErrorInvalidGraphicsContext"] = {"hipErrorInvalidGraphicsContext", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 79 + cuda2hipRename["CUDA_ERROR_NVLINK_UNCORRECTABLE"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 220 [CUDA 8.0.44] + cuda2hipRename["cudaErrorNvlinkUncorrectable"] = {"hipErrorNvlinkUncorrectable", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 80 [CUDA 8.0.44] + cuda2hipRename["CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_DRIVER}; // 302 cuda2hipRename["cudaErrorSharedObjectSymbolNotFound"] = {"hipErrorSharedObjectSymbolNotFound", CONV_ERR, API_RUNTIME, HIP_UNSUPPORTED}; // 40 @@ -434,21 +436,21 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // Memory advise values - cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_advise"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaComputeMode) // cuda2hipRename["CUmem_advise_enum"] = {"hipMemAdvise", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 - 
cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 - cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 + cuda2hipRename["CU_MEM_ADVISE_SET_READ_MOSTLY"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemAdviseSetReadMostly = 1) + cuda2hipRename["CU_MEM_ADVISE_UNSET_READ_MOSTLY"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetReadMostly = 2) + cuda2hipRename["CU_MEM_ADVISE_SET_PREFERRED_LOCATION"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaMemAdviseSetPreferredLocation = 3) + cuda2hipRename["CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetPreferredLocation = 4) + cuda2hipRename["CU_MEM_ADVISE_SET_ACCESSED_BY"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_RUNTIME ANALOGUE (cudaMemAdviseSetAccessedBy = 5) + cuda2hipRename["CU_MEM_ADVISE_UNSET_ACCESSED_BY"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_RUNTIME ANALOGUE (cudaMemAdviseUnsetAccessedBy = 6) // CUmem_range_attribute - cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUmem_range_attribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // API_RUNTIME ANALOGUE (cudaMemRangeAttribute) // cuda2hipRename["CUmem_range_attribute_enum"] = {"hipMemRangeAttribute", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 - 
cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 - cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeReadMostly = 1) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_RUNTIME ANALOGUE (cudaMemRangeAttributePreferredLocation = 2) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeAccessedBy = 3) + cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4) // Context flags cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -502,111 +504,111 @@ struct cuda2hipMap { cuda2hipRename["CUarray"] = {"hipArray *", CONV_TYPE, API_DRIVER}; // API_Runtime ANALOGUE (cudaArray_t) // unsupported yet by HIP - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerBlock = 1) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 2 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimX = 2) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 3 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimY = 3) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"] = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_Runtime ANALOGUE (cudaDevAttrMaxBlockDimZ = 4) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"] = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 5 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimX =5) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"] = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 6 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimY = 6) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"] = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 7 // API_Runtime ANALOGUE (cudaDevAttrMaxGridDimZ - 7) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerBlock = 8) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"] = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 8 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"] = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 9 // API_Runtime ANALOGUE (cudaDevAttrTotalConstantMemory = 9) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"] = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 10 // API_Runtime ANALOGUE (cudaDevAttrWarpSize = 10) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"] = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 11 // API_Runtime ANALOGUE (cudaDevAttrMaxPitch = 11) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 12) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"] = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 12 + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"] = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 13 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerBlock = 13) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"] = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 14 // API_Runtime ANALOGUE (cudaDevAttrTextureAlignment = 14) // Deprecated. 
Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE (cudaDevAttrIntegrated = 18) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, 
HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 15 // API_Runtime ANALOGUE (cudaDevAttrGpuOverlap = 15) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"] = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 16 // API_Runtime ANALOGUE (cudaDevAttrMultiProcessorCount = 16) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"] = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 17 // API_Runtime ANALOGUE (cudaDevAttrKernelExecTimeout = 17) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"] = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 18 // API_Runtime ANALOGUE 
(cudaDevAttrIntegrated = 18) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"] = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 19 // API_Runtime ANALOGUE (cudaDevAttrCanMapHostMemory = 19) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"] = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER}; // 20 // API_Runtime ANALOGUE (cudaDevAttrComputeMode = 20) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"] = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 21 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DWidth = 21) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"] = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 22 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DWidth = 22) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 23 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DHeight = 23) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"] = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 24 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidth = 24) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"] = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 25 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeight = 25) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"] = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 26 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepth = 26) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"] = 
{"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 27 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredWidth = 27) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 28 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredHeight = 28) // Deprecated, use CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) - 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", 
CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"] = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 29 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLayeredLayers = 29) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"] = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 30 // API_Runtime ANALOGUE (cudaDevAttrSurfaceAlignment = 30) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"] = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER}; // 31 // API_Runtime ANALOGUE (cudaDevAttrConcurrentKernels = 31) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"] = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 32 // API_Runtime ANALOGUE (cudaDevAttrEccEnabled = 32) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"] = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER}; // 33 // API_Runtime ANALOGUE (cudaDevAttrPciBusId = 33) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"] = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER}; // 34 // API_Runtime ANALOGUE (cudaDevAttrPciDeviceId = 34) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"] = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 35 // API_Runtime ANALOGUE (cudaDevAttrTccDriver = 35) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"] = 
{"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 36 // API_Runtime ANALOGUE (cudaDevAttrMemoryClockRate = 36) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"] = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER}; // 37 // API_Runtime ANALOGUE (cudaDevAttrGlobalMemoryBusWidth = 37) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"] = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER}; // 38 // API_Runtime ANALOGUE (cudaDevAttrL2CacheSize = 38) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER}; // 39 // API_Runtime ANALOGUE (cudaDevAttrMaxThreadsPerMultiProcessor = 39) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"] = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 40 // API_Runtime ANALOGUE (cudaDevAttrAsyncEngineCount = 40) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"] = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 41 // API_Runtime ANALOGUE (cudaDevAttrUnifiedAddressing = 41) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 42 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredWidth = 42) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 43 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLayeredLayers = 43) // deprecated, do not use - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime 
ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = 
{"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = 
{"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // 
API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", 
CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE (cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) - cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (no) - + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"] = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 44 // API_Runtime ANALOGUE (no) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"] = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 45 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherWidth = 45) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 46 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DGatherHeight = 46) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 47 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DWidthAlt = 47) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 48 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DHeightAlt = 48) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"] = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 49 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture3DDepthAlt = 49) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"] = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 50 // API_Runtime ANALOGUE (cudaDevAttrPciDomainId = 50) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"] = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 51 // API_Runtime ANALOGUE (cudaDevAttrTexturePitchAlignment = 51) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 52 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapWidth = 52) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 53 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredWidth = 53) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 54 // API_Runtime ANALOGUE (cudaDevAttrMaxTextureCubemapLayeredLayers = 54) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"] = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 55 // 
API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DWidth = 55) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"] = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 56 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DWidth = 56) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 57 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DHeight = 57) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"] = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 58 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DWidth = 58) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"] = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 59 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DHeight = 59) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"] = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 60 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface3DDepth = 60) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 61 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredWidth = 61) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 62 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface1DLayeredLayers = 62) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 63 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredWidth = 63) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"] = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 64 // 
API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredHeight = 64) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 65 // API_Runtime ANALOGUE (cudaDevAttrMaxSurface2DLayeredLayers = 65) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 66 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapWidth = 66) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 67 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredWidth = 67) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"] = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 68 // API_Runtime ANALOGUE (cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 69 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DLinearWidth = 69) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"] = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 70 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearWidth = 70) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 71 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearHeight = 71) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"] = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 72 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DLinearPitch = 72) + 
cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 73 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedWidth = 73) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"] = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 74 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture2DMipmappedHeight = 74) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"] = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER}; // 75 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMajor = 75) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"] = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER}; // 76 // API_Runtime ANALOGUE (cudaDevAttrComputeCapabilityMinor = 76) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"] = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 77 // API_Runtime ANALOGUE (cudaDevAttrMaxTexture1DMipmappedWidth = 77) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"] = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 78 // API_Runtime ANALOGUE (cudaDevAttrStreamPrioritiesSupported = 78) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 79 // API_Runtime ANALOGUE (cudaDevAttrGlobalL1CacheSupported = 79) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"] = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 80 // API_Runtime ANALOGUE (cudaDevAttrLocalL1CacheSupported = 80) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER}; // 81 // API_Runtime ANALOGUE 
(cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 82 // API_Runtime ANALOGUE (cudaDevAttrMaxRegistersPerMultiprocessor = 82) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"] = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 83 // API_Runtime ANALOGUE (cudaDevAttrManagedMemory = 83) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"] = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER}; // 84 // API_Runtime ANALOGUE (cudaDevAttrIsMultiGpuBoard = 84) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"] = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 85 // API_Runtime ANALOGUE (cudaDevAttrMultiGpuBoardGroupID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"] = 
{"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 86 // API_Runtime ANALOGUE (cudaDevAttrHostNativeAtomicSupported = 86) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 87 // API_Runtime ANALOGUE (cudaDevAttrSingleToDoublePrecisionPerfRatio = 87) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 88 // API_Runtime ANALOGUE (cudaDevAttrPageableMemoryAccess = 88) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 89 // API_Runtime ANALOGUE (cudaDevAttrConcurrentManagedAccess = 89) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 90 // API_Runtime ANALOGUE (cudaDevAttrComputePreemptionSupported = 90) + cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 91 // API_Runtime ANALOGUE (cudaDevAttrCanUseHostPointerForRegisteredMem = 91) + + cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"] = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 92 // API_Runtime ANALOGUE (no) cuda2hipRename["CUdevprop_st"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; cuda2hipRename["CUdevprop"] = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER}; @@ -929,11 +931,11 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CUdevice_P2PAttribute"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime 
ANALOGUE (cudaDeviceP2PAttr) // cuda2hipRename["CUdevice_P2PAttribute_enum"] = {"hipDeviceP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (cudaDevP2PAttrPerformanceRank = 0x01) + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) + cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) // Events // pointer to CUevent_st @@ -1352,12 +1354,12 @@ struct cuda2hipMap { cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"] = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 85 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID = 85) // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = 
{"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; - cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"] = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 86 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED = 86) + cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"] = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 87 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO = 87) + cuda2hipRename["cudaDevAttrPageableMemoryAccess"] = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 88 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS = 88) + cuda2hipRename["cudaDevAttrConcurrentManagedAccess"] = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 89 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS = 89) + cuda2hipRename["cudaDevAttrComputePreemptionSupported"] = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 90 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED = 90) + cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 91 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM = 91) // Pointer Attributes // 
struct cudaPointerAttributes @@ -1375,6 +1377,13 @@ struct cuda2hipMap { cuda2hipRename["cudaDeviceGetStreamPriorityRange"] = {"hipDeviceGetStreamPriorityRange", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaSetValidDevices"] = {"hipSetValidDevices", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // P2P Attributes + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) cuda2hipRename["cudaComputeModeDefault"] = {"hipComputeModeDefault", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0 // API_DRIVER ANALOGUE (CU_COMPUTEMODE_DEFAULT = 0) @@ -1538,7 +1547,7 @@ struct cuda2hipMap { cuda2hipRename["cudaResourceTypeLinear"] = {"hipResourceTypeLinear", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_LINEAR = 0x02) cuda2hipRename["cudaResourceTypePitch2D"] = {"hipResourceTypePitch2D", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Driver ANALOGUE (CU_RESOURCE_TYPE_PITCH2D = 0x03) - + // enum cudaResourceViewFormat cuda2hipRename["cudaResourceViewFormat"] = 
{"hipResourceViewFormat", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUresourceViewFormat) cuda2hipRename["cudaResViewFormatNone"] = {"hipResViewFormatNone", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x00 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_NONE = 0x00) cuda2hipRename["cudaResViewFormatUnsignedChar1"] = {"hipResViewFormatUnsignedChar1", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Driver ANALOGUE (CU_RES_VIEW_FORMAT_UINT_1X8 = 0x01) @@ -1582,6 +1591,22 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = 
{"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) + // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From 81354999e81ece299b087ba8110ac24d370a391d Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 30 May 2017 21:54:33 -0500 Subject: [PATCH 114/171] Set event->_stream on hipHccModuleLaunchKernel path if start/stop used Ensure _stream is always non-null in recorded events. Fixes isDefaultStream fault. 
[ROCm/clr commit: 6cc5dc03265d50e4ae511c66e25854bab51fe40e] --- projects/clr/hipamd/src/hip_event.cpp | 4 +++- projects/clr/hipamd/src/hip_hcc_internal.h | 3 ++- projects/clr/hipamd/src/hip_module.cpp | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 2c31769718..8ef652489a 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -42,11 +42,13 @@ ihipEvent_t::ihipEvent_t(unsigned flags) // Attach to an existing completion future: -void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType) +void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, + hipStream_t stream, ihipEventType_t eventType) { _state = hipEventStatusRecording; _marker = *cf; _type = eventType; + _stream = stream; } diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 94ad4f9340..b15d5a73e4 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -538,6 +538,7 @@ public: const ihipDevice_t * getDevice() const; ihipCtx_t * getCtx() const; + // Before calling this function, stream must be resolved from "0" to the actual stream: bool isDefaultStream() const { return _id == 0; }; public: @@ -602,7 +603,7 @@ enum ihipEventType_t { class ihipEvent_t { public: ihipEvent_t(unsigned flags); - void attachToCompletionFuture(const hc::completion_future *cf, ihipEventType_t eventType); + void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); void setTimestamp(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index d364a6b519..2a3bfabc28 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ 
b/projects/clr/hipamd/src/hip_module.cpp @@ -455,10 +455,10 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if (startEvent) { - startEvent->attachToCompletionFuture(&cf, hipEventTypeStartCommand); + startEvent->attachToCompletionFuture(&cf, hStream, hipEventTypeStartCommand); } if (stopEvent) { - stopEvent->attachToCompletionFuture (&cf, hipEventTypeStopCommand); + stopEvent->attachToCompletionFuture (&cf, hStream, hipEventTypeStopCommand); } From b22fdeb171f0ac4031ff01da40fca035553b8ad3 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 4 May 2017 13:57:01 +0530 Subject: [PATCH 115/171] Print msg for single gpu Change-Id: I2d23c73542add8973990ba96592016726994422e [ROCm/clr commit: e104c2e3bfebe2819735d1f1c9c30c63c84b98c7] --- .../samples/2_Cookbook/8_peer2peer/peer2peer.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/projects/clr/hipamd/samples/2_Cookbook/8_peer2peer/peer2peer.cpp b/projects/clr/hipamd/samples/2_Cookbook/8_peer2peer/peer2peer.cpp index 990599e1cb..0f532a2f0a 100644 --- a/projects/clr/hipamd/samples/2_Cookbook/8_peer2peer/peer2peer.cpp +++ b/projects/clr/hipamd/samples/2_Cookbook/8_peer2peer/peer2peer.cpp @@ -55,13 +55,9 @@ void checkPeer2PeerSupport() { int gpuCount; int canAccessPeer; - int p2pCapableDeviceCount=0; HIPCHECK(hipGetDeviceCount(&gpuCount)); - if (gpuCount < 2) - printf("Peer2Peer application requires atleast 2 gpu devices"); - for (int currentGpu=0; currentGpu Date: Thu, 11 May 2017 11:30:49 +0530 Subject: [PATCH 116/171] Add unroll and inline asm cookbook samples Change-Id: Ie5a0fbb01b7fca82959090d89299533d49e092f1 [ROCm/clr commit: 5696eaf84213f9fb54c6282b9b8b7724977da92f] --- .../samples/2_Cookbook/10_inline_asm/Makefile | 35 ++++ .../2_Cookbook/10_inline_asm/inline_asm.cpp | 174 ++++++++++++++++++ .../samples/2_Cookbook/9_unroll/Makefile | 39 ++++ .../samples/2_Cookbook/9_unroll/unroll.cpp | 141 ++++++++++++++ 4 files changed, 389 insertions(+) create mode 100644 
projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile create mode 100644 projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/inline_asm.cpp create mode 100644 projects/clr/hipamd/samples/2_Cookbook/9_unroll/Makefile create mode 100644 projects/clr/hipamd/samples/2_Cookbook/9_unroll/unroll.cpp diff --git a/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile new file mode 100644 index 0000000000..77a7699635 --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile @@ -0,0 +1,35 @@ +HIP_PATH?= $(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = inline_asm.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./inline_asm + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/inline_asm.cpp b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/inline_asm.cpp new file mode 100644 index 0000000000..2b4fc3de90 --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/inline_asm.cpp @@ -0,0 +1,174 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define WIDTH 1024 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + + asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); +} + +// CPU implementation of matrix transpose +void matrixTransposeCPUReference( + float * output, + float * input, + const unsigned int width) +{ + for(unsigned int j=0; j < width; j++) + { + for(unsigned int i=0; i < width; i++) + { + output[i*width + j] = input[j*width + i]; + } + } +} + +int main() { + + float* Matrix; + float* TransposeMatrix; + float* cpuTransposeMatrix; + + float* gpuMatrix; + float* gpuTransposeMatrix; + + hipDeviceProp_t devProp; + hipGetDeviceProperties(&devProp, 0); + + std::cout << "Device name " << devProp.name << std::endl; + + hipEvent_t start, stop; + hipEventCreate(&start); + hipEventCreate(&stop); + float eventMs = 1.0f; + + int i; + int errors; + + Matrix = (float*)malloc(NUM * sizeof(float)); + TransposeMatrix = (float*)malloc(NUM * sizeof(float)); + cpuTransposeMatrix = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + Matrix[i] = (float)i*10.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuMatrix, NUM * sizeof(float)); + hipMalloc((void**)&gpuTransposeMatrix, NUM * sizeof(float)); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from host to device + hipMemcpy(gpuMatrix, Matrix, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Record the stop event + hipEventRecord(stop, NULL); + 
hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyHostToDevice time taken = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Lauching kernel from host + hipLaunchKernel(matrixTranspose, + dim3(WIDTH/THREADS_PER_BLOCK_X, WIDTH/THREADS_PER_BLOCK_Y), + dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), + 0, 0, + gpuTransposeMatrix , gpuMatrix, WIDTH); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("kernel Execution time = %6.3fms\n", eventMs); + + // Record the start event + hipEventRecord(start, NULL); + + // Memory transfer from device to host + hipMemcpy(TransposeMatrix, gpuTransposeMatrix, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // Record the stop event + hipEventRecord(stop, NULL); + hipEventSynchronize(stop); + + hipEventElapsedTime(&eventMs, start, stop); + + printf ("hipMemcpyDeviceToHost time taken = %6.3fms\n", eventMs); + + // CPU MatrixTranspose computation + matrixTransposeCPUReference(cpuTransposeMatrix, Matrix, WIDTH); + + // verify the results + errors = 0; + double eps = 1.0E-6; + for (i = 0; i < NUM; i++) { + if (std::abs(TransposeMatrix[i] - cpuTransposeMatrix[i]) > eps ) { + printf("gpu%f cpu %f \n",TransposeMatrix[i],cpuTransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} diff --git a/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Makefile b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Makefile new file mode 100644 index 0000000000..b71f3d8353 --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Makefile @@ -0,0 +1,39 @@ +HIP_PATH?= 
$(wildcard /opt/rocm/hip) +ifeq (,$(HIP_PATH)) + HIP_PATH=../../.. +endif + +ifeq (gfx701, $(findstring gfx701,$(HCC_AMDGPU_TARGET))) + $(error gfx701 is not a supported device for this sample) +endif + +HIPCC=$(HIP_PATH)/bin/hipcc + +TARGET=hcc + +SOURCES = unroll.cpp +OBJECTS = $(SOURCES:.cpp=.o) + +EXECUTABLE=./unroll + +.PHONY: test + + +all: $(EXECUTABLE) test + +CXXFLAGS =-g +CXX=$(HIPCC) + + +$(EXECUTABLE): $(OBJECTS) + $(HIPCC) $(OBJECTS) -o $@ + + +test: $(EXECUTABLE) + $(EXECUTABLE) + + +clean: + rm -f $(EXECUTABLE) + rm -f $(OBJECTS) + rm -f $(HIP_PATH)/src/*.o diff --git a/projects/clr/hipamd/samples/2_Cookbook/9_unroll/unroll.cpp b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/unroll.cpp new file mode 100644 index 0000000000..22f1c75e6e --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/unroll.cpp @@ -0,0 +1,141 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include + +// hip header file +#include "hip/hip_runtime.h" + + +#define WIDTH 4 + +#define NUM (WIDTH*WIDTH) + +#define THREADS_PER_BLOCK_X 4 +#define THREADS_PER_BLOCK_Y 4 +#define THREADS_PER_BLOCK_Z 1 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void matrixTranspose(hipLaunchParm lp, + float *out, + float *in, + const int width) +{ + int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + float val = in[x]; + +#pragma unroll + for(int i=0;i eps ) { + printf("%d cpu: %f gpu %f\n",i,cpuTransposeMatrix[i],TransposeMatrix[i]); + errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuMatrix); + hipFree(gpuTransposeMatrix); + + //free the resources on host side + free(Matrix); + free(TransposeMatrix); + free(cpuTransposeMatrix); + + return errors; +} From c3167f463d5d7ab6948231961ab2cb21eef41ad9 Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 16:33:31 +0530 Subject: [PATCH 117/171] Add inline asm hip directed tests for v_add and v_mac Change-Id: Ie5ace2e42d5da89b16e040537df2bb13d3883c6d [ROCm/clr commit: c964a5f208ed650f30900e9244bf65551e12ce00] --- .../tests/src/kernel/inline_asm_vadd.cpp | 126 ++++++++++++++++++ .../tests/src/kernel/inline_asm_vmac.cpp | 125 +++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp create mode 100644 projects/clr/hipamd/tests/src/kernel/inline_asm_vmac.cpp diff --git a/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp b/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp new file mode 100644 index 0000000000..481b606e89 --- /dev/null +++ b/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp @@ -0,0 +1,126 @@ +/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+ +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and +associated documentation files (the "Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the +following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial +portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT +LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO +EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER +IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR +THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ + +/* HIT_START + * BUILD: %t %s + * RUN: %t + * HIT_END + */ + + +#include + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 + +#define THREADS_PER_BLOCK_X 4 + +// Device (Kernel) function, it must be void +// hipLaunchParm provides the execution configuration +__global__ void vadd_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i])); +} + +// CPU implementation of Vector Result +void addCPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Lauching kernel from host + hipLaunchKernel(vadd_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation + addCPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { + 
errors++; + } + } + if (errors!=0) { + printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} diff --git a/projects/clr/hipamd/tests/src/kernel/inline_asm_vmac.cpp b/projects/clr/hipamd/tests/src/kernel/inline_asm_vmac.cpp new file mode 100644 index 0000000000..1b6941c249 --- /dev/null +++ b/projects/clr/hipamd/tests/src/kernel/inline_asm_vmac.cpp @@ -0,0 +1,125 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include <iostream> + +// hip header file +#include "hip/hip_runtime.h" + +#define NUM 1024 + +#define THREADS_PER_BLOCK_X 4 +const float a = 10.0f; // saxpy scale factor; file scope so that both vmac_asm and CPUReference can read it + +// Device (Kernel) function, it must be void; each thread computes out[i] = a * in[i] + out[i] with one v_mac_f32 +// hipLaunchParm provides the execution configuration +__global__ void vmac_asm(hipLaunchParm lp, + float *out, + float *in) +{ + int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; + + asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i])); +} + +// CPU implementation of saxpy: output[j] = a * input[j] + output[j] for all NUM elements (updates output in place) +void CPUReference( + float * output, + float * input) +{ + for(unsigned int j=0; j < NUM; j++) + { + + output[j]= a*input[j] + output[j]; + } +} + +int main(){ + + float* VectorA; + float* ResultVector; + float* VectorB; + + float* gpuVector; + float* gpuResultVector; + + int i; + int errors; + + VectorA = (float*)malloc(NUM * sizeof(float)); + ResultVector = (float*)malloc(NUM * sizeof(float)); + VectorB = (float*)malloc(NUM * sizeof(float)); + + // initialize the input data + for (i = 0; i < NUM; i++) { + VectorA[i] = (float)i*10.0f; + VectorB[i] = (float)i*30.0f; + } + + // allocate the memory on the device side + hipMalloc((void**)&gpuVector, NUM * sizeof(float)); + hipMalloc((void**)&gpuResultVector, NUM * sizeof(float)); + + // Memory transfer from host to device + hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice); + hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice); + + // Launching kernel from host + hipLaunchKernel(vmac_asm, + dim3(NUM/THREADS_PER_BLOCK_X), + dim3(THREADS_PER_BLOCK_X), + 0, 0, + gpuResultVector , gpuVector); + + // Memory transfer from device to host + hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost); + + // CPU Result computation (same saxpy applied to the host copy, VectorB, for comparison) + CPUReference(VectorB, VectorA); + + // verify the results + errors = 0; + double eps = 1.0E-3; + for (i = 0; i < NUM; i++) { + if (std::abs(ResultVector[i] - VectorB[i]) > eps ) { + errors++; + } + } + if (errors!=0) { 
+ printf("FAILED: %d errors\n",errors); + } else { + printf ("PASSED!\n"); + } + + //free the resources on device side + hipFree(gpuVector); + hipFree(gpuResultVector); + + hipDeviceReset(); + + //free the resources on host side + free(VectorA); + free(ResultVector); + free(VectorB); + + return errors; +} From ac8089e77351b1d54a4f20347b9c54e0f1a5b8de Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Thu, 11 May 2017 18:43:24 +0530 Subject: [PATCH 118/171] Add readme for inline asm and unroll cookbook samples Change-Id: I71b7a5652c3dad181c5df60ab0dd1b81d79f1bfb [ROCm/clr commit: f6b98854bac773272ab9b58653afdb9f78221687] --- .../2_Cookbook/10_inline_asm/Readme.md | 47 ++++++++++++++++++ .../samples/2_Cookbook/9_unroll/Readme.md | 48 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Readme.md create mode 100644 projects/clr/hipamd/samples/2_Cookbook/9_unroll/Readme.md diff --git a/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Readme.md b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Readme.md new file mode 100644 index 0000000000..8c98547220 --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Readme.md @@ -0,0 +1,47 @@ +## inline asm ### + +This tutorial is about how to use inline GCN assembly in a kernel. We'll explain how to do so using the simple Matrix Transpose example. + +## Introduction: + +If you want to take advantage of the extra performance benefits of writing in assembly, as well as special GPU hardware features that are only available through assembly, then this tutorial is for you. In this tutorial we'll be explaining how to start writing inline asm in a kernel. 
+ +For more insight, please read the following blogs by Ben Sander: +[The Art of AMDGCN Assembly: How to Bend the Machine to Your Will](http://gpuopen.com/amdgcn-assembly) +[AMD GCN Assembly: Cross-Lane Operations](http://gpuopen.com/amd-gcn-assembly-cross-lane-operations/) + +For more information: +[AMD GCN3 ISA Architecture Manual](http://gpuopen.com/compute-product/amd-gcn3-isa-architecture-manual/) +[User Guide for AMDGPU Back-end](http://llvm.org/docs/AMDGPUUsage.html) + +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +We will be using the Simple Matrix Transpose application from our very first tutorial. + +## asm() Assembler statement + +We insert GCN ISA instructions into the kernel using the asm() assembler statement. In the same source code we used for MatrixTranspose, we'll add the following: + +` asm volatile ("v_mov_b32_e32 %0, %1" : "=v" (out[x*width + y]) : "v" (in[y*width + x])); ` + +## How to build and run: +Use the make command and execute it using ./exe +Use hipcc to build the application; hipcc uses hcc on AMD and nvcc on NVIDIA. 
+ + +## More Info: +- [HIP FAQ](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_faq.md) +- [HIP Kernel Language](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_kernel_language.md) +- [HIP Runtime API (Doxygen)](http://gpuopen-professionalcompute-tools.github.io/HIP) +- [HIP Porting Guide](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_porting_guide.md) +- [HIP Terminology](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/docs/markdown/hip_terms.md) (including Rosetta Stone of GPU computing terms across CUDA/HIP/HC/AMP/OpenCL) +- [clang-hipify](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/clang-hipify/README.md) +- [Developer/CONTRIBUTING Info](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/CONTRIBUTING.md) +- [Release Notes](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/RELEASE.md) diff --git a/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Readme.md b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Readme.md new file mode 100644 index 0000000000..3c2635c0eb --- /dev/null +++ b/projects/clr/hipamd/samples/2_Cookbook/9_unroll/Readme.md @@ -0,0 +1,48 @@ +## Using Pragma unroll ### + +In this tutorial, we'll explain how to use #pragma unroll to improve performance. + +## Introduction: + +Loop unrolling optimization hints can be specified with #pragma unroll and #pragma nounroll. The pragma is placed immediately before a for loop. +Specifying #pragma unroll without a parameter directs the loop unroller to attempt to fully unroll the loop if the trip count is known at compile time and attempt to partially unroll the loop if the trip count is not known at compile time. 
+ +## Requirement: +For hardware requirements and software installation, see [Installation](https://github.com/GPUOpen-ProfessionalCompute-Tools/HIP/INSTALL.md) + +## Prerequisite knowledge: + +Programmers familiar with CUDA or OpenCL will be able to quickly learn and start coding with the HIP API. In case you are not, don't worry. You chose to start with the best one. We'll be explaining everything assuming you are completely new to GPGPU programming. + +## Simple Matrix Transpose + +For this tutorial we will be using MatrixTranspose with the shfl operation, i.e., our 4_shfl tutorial, since it is the only example where we used loops inside the kernel. + +In this tutorial, we'll use `#pragma unroll`. In the same source code we used for MatrixTranspose, we'll add it just before the for loop, as follows: + +`#pragma unroll ` +` for(int i=0;i Date: Wed, 31 May 2017 10:15:41 +0530 Subject: [PATCH 119/171] Disable rcbrt, scalbln and scalbn double precision device test Change-Id: I46bd895701c46d3592b553090cafba99e41a2e2d [ROCm/clr commit: da19087ae2693d05122c07e5fa4521d876f2f846] --- .../tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp index df5dad3968..f4f7ab0479 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipDoublePrecisionMathDevice.cpp @@ -99,7 +99,7 @@ __device__ void double_precision_math_functions() normcdf(0.0); normcdfinv(1.0); pow(1.0, 0.0); - rcbrt(1.0); + //rcbrt(1.0); remainder(2.0, 1.0); // remquo(1.0, 2.0, &iX); rhypot(0.0, 1.0); @@ -109,8 +109,8 @@ __device__ void double_precision_math_functions() rnorm4d(0.0, 0.0, 0.0, 1.0); round(0.0); rsqrt(1.0); - scalbln(0.0, 1); - scalbn(0.0, 1); + //scalbln(0.0, 1); + //scalbn(0.0, 1); signbit(1.0); sin(0.0); sincos(0.0, 
&fX, &fY); From 2985fa381488b272c27f39b53a7cc8c06c103cc0 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:19 +0530 Subject: [PATCH 120/171] Disable rcbrtf, scalblnf, scalbnf in single precision device test Change-Id: I8a250a64a0cb05132d022a11d9766ced9cdf11a7 [ROCm/clr commit: 2145e94049369eb835faaa33385c12f39b3cc220] --- .../tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp index 53ccd2251f..de3dec35ef 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipSinglePrecisionMathDevice.cpp @@ -100,7 +100,7 @@ __device__ void single_precision_math_functions() normcdfinvf(1.0f); fX = 1.0f; normf(1, &fX); powf(1.0f, 0.0f); - rcbrtf(1.0f); + //rcbrtf(1.0f); remainderf(2.0f, 1.0f); //remquof(1.0f, 2.0f, &iX); rhypotf(0.0f, 1.0f); @@ -110,8 +110,8 @@ __device__ void single_precision_math_functions() fX = 1.0f; rnormf(1, &fX); roundf(0.0f); rsqrtf(1.0f); - scalblnf(0.0f, 1); - scalbnf(0.0f, 1); + //scalblnf(0.0f, 1); + //scalbnf(0.0f, 1); signbit(1.0f); sincosf(0.0f, &fX, &fY); sincospif(0.0f, &fX, &fY); From 0e4e17db27cd915e80ea885389c89dd0bf77212e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 31 May 2017 10:16:57 +0530 Subject: [PATCH 121/171] Fix hipMemoryAllocate test for single GPU Change-Id: If121c18ab490ba125dc689ffc08a8839fd280c38 [ROCm/clr commit: 06ee0d3704acf23031fc6a0388ee0663f91fae28] --- .../tests/src/runtimeApi/memory/hipMemoryAllocate.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp index 1ee5cbc9bb..34951f0a09 100644 --- 
a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp @@ -36,7 +36,6 @@ void multiGpuHostAlloc(int allocDevice) int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); @@ -121,10 +120,12 @@ int main(int argc, char *argv[]) { int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); - assert(numDevices > 1); multiGpuHostAlloc(0); - multiGpuHostAlloc(1); + if (numDevices > 1) + { + multiGpuHostAlloc(1); + } } passed(); From afbf55a9dc230bb5304eb4c5b482aea6a578ab64 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 31 May 2017 18:55:29 +0300 Subject: [PATCH 122/171] [HIP] [HIPIFY] CUDA Driver API 8.0.44 JIT options support. [ROCm/clr commit: 463c026976730fd7c61e601e4b7c0a217cae50b0] --- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 4 ++-- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h | 2 ++ projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 59d05e69f7..9c22fde573 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -718,8 +718,8 @@ struct cuda2hipMap { cuda2hipRename["CU_JIT_GENERATE_LINE_INFO"] = {"hipJitOptionGenerateLineInfo", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_CACHE_MODE"] = {"hipJitOptionCacheMode", CONV_JIT, API_DRIVER}; // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_JIT_NEW_SM3X_OPT"] = {"hipJitOptionSm3xOpt", CONV_JIT, 
API_DRIVER}; + cuda2hipRename["CU_JIT_FAST_COMPILE"] = {"hipJitOptionFastCompile", CONV_JIT, API_DRIVER}; cuda2hipRename["CU_JIT_NUM_OPTIONS"] = {"hipJitOptionNumOptions", CONV_JIT, API_DRIVER}; // enum CUjit_target/CUjit_target_enum diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 6059e1e92d..25eac31ec6 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -158,6 +158,8 @@ typedef enum hipJitOption { hipJitOptionLogVerbose, hipJitOptionGenerateLineInfo, hipJitOptionCacheMode, + hipJitOptionSm3xOpt, + hipJitOptionFastCompile, hipJitOptionNumOptions } hipJitOption; diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index b09c9323c7..f92523a3e3 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -94,6 +94,8 @@ hipMemcpyHostToHost #define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE #define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO #define hipJitOptionCacheMode CU_JIT_CACHE_MODE +#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT +#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE #define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS typedef cudaEvent_t hipEvent_t; From 8514cf513a5e52a194473260084b03e94535d98b Mon Sep 17 00:00:00 2001 From: Siu Chi Chan Date: Wed, 31 May 2017 15:19:26 -0400 Subject: [PATCH 123/171] fix atomicCAS:remove load for the return value after CAS [ROCm/clr commit: 969931b1cee92f9072fe2321d9b6df61fb63baf1] --- projects/clr/hipamd/src/device_util.cpp | 34 ++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/src/device_util.cpp b/projects/clr/hipamd/src/device_util.cpp index bea42aba46..e59a44e5ba 100644 --- 
a/projects/clr/hipamd/src/device_util.cpp +++ b/projects/clr/hipamd/src/device_util.cpp @@ -26,6 +26,7 @@ THE SOFTWARE. #include "device_util.h" #include "hip/hcc_detail/device_functions.h" #include "hip/hip_runtime.h" +#include //================================================================================================= /* @@ -923,24 +924,45 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address, } //atomicCAS() +template +__device__ T atomicCAS_impl(T* address, T compare, T val) +{ + // the implementation assumes the atomic is lock-free and + // has the same size as the non-atmoic equivalent type + static_assert(sizeof(T) == sizeof(std::atomic) + , "size mismatch between atomic and non-atomic types"); + + union { + T* address; + std::atomic* atomic_address; + } u; + u.address = address; + + T expected = compare; + + // hcc should generate a system scope atomic CAS + std::atomic_compare_exchange_weak_explicit(u.atomic_address + , &expected, val + , std::memory_order_acq_rel + , std::memory_order_relaxed); + return expected; +} + __device__ int atomicCAS(int* address, int compare, int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned int atomicCAS(unsigned int* address, unsigned int compare, unsigned int val) { - hc::atomic_compare_exchange(address,&compare,val); - return *address; + return atomicCAS_impl(address, compare, val); } __device__ unsigned long long int atomicCAS(unsigned long long int* address, unsigned long long int compare, unsigned long long int val) { - hc::atomic_compare_exchange((uint64_t*)address,(uint64_t*)&compare,(uint64_t)val); - return *address; + return atomicCAS_impl(address, compare, val); } //atomicAnd() From b30b1acc5c2cf3219c67a21f97af25e3707d46fc Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 1 Jun 2017 21:08:33 +0300 Subject: [PATCH 124/171] [HIPIFY] All CUDA 8.0.44 API functions update (for both 
Driver and Runtime APIs) 1) P2P cuDeviceGetP2PAttribute cudaDeviceGetP2PAttribute 2) Memory Mngmnt cuMemPrefetchAsync cudaMemPrefetchAsync cuMemAdvise cudaMemAdvise cuMemRangeGetAttribute cudaMemRangeGetAttribute cuMemRangeGetAttributes cudaMemRangeGetAttributes 3) Streams (Driver API only, no analogues in Runtime API) cuStreamWaitValue32 cuStreamWaitValue32 cuStreamWriteValue32 4) Texture Reference Mngmnt (Driver API only, no analogues in Runtime API) cuTexRefSetBorderColor cuTexRefGetBorderColor [ROCm/clr commit: ee85243bcdadef1b31b8b0fe4fc14354d3d128e5] --- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 59 ++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 9c22fde573..0825285b51 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -937,6 +937,8 @@ struct cuda2hipMap { cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (cudaDevP2PAttrAccessSupported = 0x02) cuda2hipRename["CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (cudaDevP2PAttrNativeAtomicSupported = 0x03) + cuda2hipRename["cuDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (cudaDeviceGetP2PAttribute) + // Events // pointer to CUevent_st cuda2hipRename["CUevent"] = {"hipEvent_t", CONV_TYPE, API_DRIVER}; @@ -973,6 +975,9 @@ struct cuda2hipMap { // Streams // unsupported yet by HIP cuda2hipRename["cuStreamAddCallback"] = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, 
HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamDestroy_v2"] = {"hipStreamDestroy", CONV_STREAM, API_DRIVER}; @@ -1014,6 +1019,11 @@ struct cuda2hipMap { cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) + cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) + cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) + cuda2hipRename["cuMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes) + // Texture Reference Mngmnt // Texture reference filtering modes cuda2hipRename["CUfilter_mode"] = {"hipTextureFilterMode", CONV_TEX, API_DRIVER}; // API_Runtime ANALOGUE (cudaTextureFilterMode) @@ -1022,6 +1032,9 @@ struct cuda2hipMap { cuda2hipRename["CU_TR_FILTER_MODE_POINT"] = {"hipFilterModePoint", CONV_TEX, API_DRIVER}; // 0 // API_Runtime ANALOGUE (cudaFilterModePoint = 0) cuda2hipRename["CU_TR_FILTER_MODE_LINEAR"] = {"hipFilterModeLinear", CONV_TEX, API_DRIVER, HIP_UNSUPPORTED}; // 1 // API_Runtime ANALOGUE 
(cudaFilterModeLinear = 1) + cuda2hipRename["cuTexRefSetBorderColor"] = {"hipTexRefSetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuTexRefGetBorderColor"] = {"hipTexRefGetBorderColor", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + // Profiler // unsupported yet by HIP cuda2hipRename["cuProfilerInitialize"] = {"hipProfilerInitialize", CONV_OTHER, API_DRIVER, HIP_UNSUPPORTED}; @@ -1111,6 +1124,25 @@ struct cuda2hipMap { cuda2hipRename["cudaMemcpyFromArrayAsync"] = {"hipMemcpyFromArrayAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaMemcpyFromSymbol"] = {"hipMemcpyFromSymbol", CONV_MEM, API_RUNTIME}; cuda2hipRename["cudaMemcpyFromSymbolAsync"] = {"hipMemcpyFromSymbolAsync", CONV_MEM, API_RUNTIME}; + cuda2hipRename["cudaMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + cuda2hipRename["cudaMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] + + // unsupported yet by HIP [CUDA 8.0.44] + // Memory advise values + cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) + cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) + cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) + cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION 
= 3) + cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) + cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) + cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) + // CUmem_range_attribute + cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) + cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) + cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) + cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) + cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) // memcpy kind cuda2hipRename["cudaMemcpyKind"] = {"hipMemcpyKind", CONV_MEM, API_RUNTIME}; @@ -1137,6 +1169,7 @@ struct cuda2hipMap { cuda2hipRename["cudaGetMipmappedArrayLevel"] = {"hipGetMipmappedArrayLevel", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaGetSymbolAddress"] = {"hipGetSymbolAddress", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; 
cuda2hipRename["cudaGetSymbolSize"] = {"hipGetSymbolSize", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cudaMemPrefetchAsync"] = {"hipMemPrefetchAsync", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Driver ANALOGUE (cuMemPrefetchAsync) // malloc cuda2hipRename["cudaMalloc"] = {"hipMalloc", CONV_MEM, API_RUNTIME}; @@ -1379,10 +1412,12 @@ struct cuda2hipMap { // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes - cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_Runtime ANALOGUE (CUdevice_P2PAttribute) - cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) - cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) - cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x03 // API_Runtime ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + cuda2hipRename["cudaDeviceP2PAttr"] = {"hipDeviceP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUdevice_P2PAttribute) + cuda2hipRename["cudaDevP2PAttrPerformanceRank"] = {"hipDeviceP2PAttributePerformanceRank", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x01 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_PERFORMANCE_RANK = 0x01) + cuda2hipRename["cudaDevP2PAttrAccessSupported"] = {"hipDeviceP2PAttributeAccessSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // 0x02 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_ACCESS_SUPPORTED = 0x02) + cuda2hipRename["cudaDevP2PAttrNativeAtomicSupported"] = {"hipDeviceP2PAttributeNativeAtomicSupported", CONV_DEV, API_RUNTIME, 
HIP_UNSUPPORTED}; // 0x03 // API_DRIVER ANALOGUE (CU_DEVICE_P2P_ATTRIBUTE_NATIVE_ATOMIC_SUPPORTED = 0x03) + // [CUDA 8.0.44] + cuda2hipRename["cudaDeviceGetP2PAttribute"] = {"hipDeviceGetP2PAttribute", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (cuDeviceGetP2PAttribute) // Compute mode cuda2hipRename["cudaComputeMode"] = {"hipComputeMode", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED}; // API_DRIVER ANALOGUE (CUcomputemode) @@ -1591,22 +1626,6 @@ struct cuda2hipMap { cuda2hipRename["cudaAddressModeMirror"] = {"hipAddressModeMirror", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaAddressModeBorder"] = {"hipAddressModeBorder", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; - // unsupported yet by HIP [CUDA 8.0.44] - // Memory advise values - cuda2hipRename["cudaMemoryAdvise"] = {"hipMemAdvise", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_advise) - cuda2hipRename["cudaMemAdviseSetReadMostly"] = {"hipMemAdviseSetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_READ_MOSTLY = 1) - cuda2hipRename["cudaMemAdviseUnsetReadMostly"] = {"hipMemAdviseUnsetReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_READ_MOSTLY = 2) - cuda2hipRename["cudaMemAdviseSetPreferredLocation"] = {"hipMemAdviseSetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_PREFERRED_LOCATION = 3) - cuda2hipRename["cudaMemAdviseUnsetPreferredLocation"] = {"hipMemAdviseUnsetPreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION = 4) - cuda2hipRename["cudaMemAdviseSetAccessedBy"] = {"hipMemAdviseSetAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 5 // API_Driver ANALOGUE (CU_MEM_ADVISE_SET_ACCESSED_BY = 5) - cuda2hipRename["cudaMemAdviseUnsetAccessedBy"] = {"hipMemAdviseUnsetAccessedBy", CONV_TYPE, API_RUNTIME, 
HIP_UNSUPPORTED}; // 6 // API_Driver ANALOGUE (CU_MEM_ADVISE_UNSET_ACCESSED_BY = 6) - // CUmem_range_attribute - cuda2hipRename["cudaMemRangeAttribute"] = {"hipMemRangeAttribute", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // API_Driver ANALOGUE (CUmem_range_attribute) - cuda2hipRename["cudaMemRangeAttributeReadMostly"] = {"hipMemRangeAttributeReadMostly", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 1 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_READ_MOSTLY = 1) - cuda2hipRename["cudaMemRangeAttributePreferredLocation"] = {"hipMemRangeAttributePreferredLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 2 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_PREFERRED_LOCATION = 2) - cuda2hipRename["cudaMemRangeAttributeAccessedBy"] = {"hipMemRangeAttributeAccessedBy", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 3 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_ACCESSED_BY = 3) - cuda2hipRename["cudaMemRangeAttributeLastPrefetchLocation"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}; // 4 // API_Driver ANALOGUE (CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION = 4) - // functions cuda2hipRename["cudaCreateTextureObject"] = {"hipCreateTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; cuda2hipRename["cudaDestroyTextureObject"] = {"hipDestroyTextureObject", CONV_TEX, API_RUNTIME, HIP_UNSUPPORTED}; From ef444588e13bf394e69229166c9ebdef351f594e Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:30:43 +0300 Subject: [PATCH 125/171] [HIPIFY] rename legacy hipify perl script and its usage to hipify-perl [ROCm/clr commit: e7779650e9353e3f25cad9d1881c9b63504fbbd2] --- projects/clr/hipamd/bin/hipconvertinplace-perl.sh | 10 +++++----- projects/clr/hipamd/bin/hipexamine-perl.sh | 6 +++--- projects/clr/hipamd/bin/{hipify => hipify-perl} | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) rename projects/clr/hipamd/bin/{hipify => hipify-perl} (99%) diff --git a/projects/clr/hipamd/bin/hipconvertinplace-perl.sh 
b/projects/clr/hipamd/bin/hipconvertinplace-perl.sh index a8c8d6d9e8..d500cc14c6 100755 --- a/projects/clr/hipamd/bin/hipconvertinplace-perl.sh +++ b/projects/clr/hipamd/bin/hipconvertinplace-perl.sh @@ -1,18 +1,18 @@ #!/bin/bash -#usage : hipconvertinplace.sh [DIRNAME] [HIPIFY_OPTIONS] +#usage : hipconvertinplace-perl.sh DIRNAME [hipify-perl options] -#hipify "inplace" all code files in specified directory. +#hipify "inplace" all code files in specified directory. # This can be quite handy when dealing with an existing CUDA code base since the script # preserves the existing directory structure. # For each code file, this script will: -# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then Hipify the code file. +# - If ".prehip file does not exist, copy the original code to a new file with extension ".prehip". Then hipify the code file. # - If ".prehip" file exists, this is used as input to hipify. -# (this is useful for testing improvements to the hipify toolset). +# (this is useful for testing improvements to the hipify-perl toolset). SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -inplace -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/projects/clr/hipamd/bin/hipexamine-perl.sh b/projects/clr/hipamd/bin/hipexamine-perl.sh index 40c1bf466d..9e0b01df44 100755 --- a/projects/clr/hipamd/bin/hipexamine-perl.sh +++ b/projects/clr/hipamd/bin/hipexamine-perl.sh @@ -1,12 +1,12 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify.pl options] +#usage : hipexamine.sh DIRNAME [hipify-perl options] -# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files +# Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. 
SCRIPT_DIR=`dirname $0` SEARCH_DIR=$1 shift -$SCRIPT_DIR/hipify -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` +$SCRIPT_DIR/hipify-perl -no-output -print-stats "$@" `$SCRIPT_DIR/findcode.sh $SEARCH_DIR` diff --git a/projects/clr/hipamd/bin/hipify b/projects/clr/hipamd/bin/hipify-perl similarity index 99% rename from projects/clr/hipamd/bin/hipify rename to projects/clr/hipamd/bin/hipify-perl index 4d77fad3ed..27acc5bccc 100755 --- a/projects/clr/hipamd/bin/hipify +++ b/projects/clr/hipamd/bin/hipify-perl @@ -20,7 +20,7 @@ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. ## -#usage hipify [OPTIONS] INPUT_FILE +#usage hipify-perl [OPTIONS] INPUT_FILE use Getopt::Long; my $warn_whitelist =""; @@ -201,7 +201,7 @@ while (@ARGV) { my %ft; clearStats(\%ft, \@statNames); my $countIncludes = 0; - my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify but counted here. + my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here. my $warnings = 0; my $warningsCublas = 0; my $warningsCurand = 0; From 6235e4bc7fda98991c80555b590dc7b7619cf971 Mon Sep 17 00:00:00 2001 From: emankov Date: Fri, 2 Jun 2017 16:33:48 +0300 Subject: [PATCH 126/171] [HIPIFY] annotation [ROCm/clr commit: c5f9758f4b67bee0a624780f813b9bb5e0d80c3c] --- projects/clr/hipamd/bin/hipexamine-perl.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/bin/hipexamine-perl.sh b/projects/clr/hipamd/bin/hipexamine-perl.sh index 9e0b01df44..4e3a261aa4 100755 --- a/projects/clr/hipamd/bin/hipexamine-perl.sh +++ b/projects/clr/hipamd/bin/hipexamine-perl.sh @@ -1,6 +1,6 @@ #!/bin/bash -#usage : hipexamine.sh DIRNAME [hipify-perl options] +#usage : hipexamine-perl.sh DIRNAME [hipify-perl options] # Generate HIP stats (LOC, CUDA->API conversions, missing functionality) for all the code files # in the specified directory. 
From 97fa7aeef6d7806ed282ddec832a569e61cd3966 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 2 Jun 2017 11:19:33 -0500 Subject: [PATCH 127/171] added half data type and vector destructors 1. Added half data types to hip_fp16.h 2. Added destructor to vector data types Change-Id: Id5ae76a663bb90a4bde2839ec79c58fbaee5072f [ROCm/clr commit: fdcc2238421fc0bb29f170ff4e2106999ad3bb9d] --- .../hipamd/include/hip/hcc_detail/hip_fp16.h | 1 + .../include/hip/hcc_detail/hip_vector_types.h | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index a1abce2191..b1ecc61cb0 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -28,6 +28,7 @@ THE SOFTWARE. typedef __fp16 __half; typedef __fp16 __half1 __attribute__((ext_vector_type(1))); typedef __fp16 __half2 __attribute__((ext_vector_type(2))); +typedef __fp16 half; /* Half Arithmetic Functions diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 3c3b26c12a..9da34d9f32 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -37,38 +37,41 @@ THE SOFTWARE. 
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x) { } \ -__device__ __host__ type(const type& val) : x(val.x) { } +__device__ __host__ type(const type& val) : x(val.x) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \ +__device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ __device__ __host__ type() {} \ __device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ -__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } - +__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ +__device__ __host__ ~type() {} #define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val) {} \ -__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} +__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {} +__device__ __host__ type(type1 val1, type1 
val2, type1 val3) : x(val1), y(val2), z(val3) {} \ #define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \ __device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \ -__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} +__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {} \ struct uchar1 { #ifdef __cplusplus From 6aaeed821d148e5cc56efe2f823af634400dcc04 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sat, 3 Jun 2017 17:09:19 -0500 Subject: [PATCH 128/171] Update tests, add p2p coherency test. [ROCm/clr commit: 15f54fb9439118381bd30948e75ecbd12b4a79c4] --- projects/clr/hipamd/src/hip_hcc.cpp | 4 +- .../src/runtimeApi/memory/hipHostMalloc.cpp | 17 ++ .../runtimeApi/memory/p2p_copy_coherency.cpp | 170 ++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) create mode 100644 projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 4400e4596e..5d8846da1e 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -84,6 +84,8 @@ int HIP_DENY_PEER_ACCESS = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; +// TODO - set these to 0 and 1 +int HIP_EVENT_SYS_RELEASE=1; int HIP_COHERENT_HOST_ALLOC = 0; // TODO - set to 0 once we resolve stability. 
@@ -94,9 +96,9 @@ int HIP_SYNC_HOST_ALLOC = 1; int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : +// TODO - set this to 1 int HCC_OPT_FLUSH = 0; -int HIP_EVENT_SYS_RELEASE=0; diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 54073e4901..47baf5c206 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -52,6 +52,8 @@ std::vector syncMsg = {"event", "stream", "device"}; void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { std::cerr << "test: CheckHostPointer " << msg + //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC + //<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE << " eventFlags = " << std::hex << eventFlags << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") << ((eventFlags & hipEventReleaseToSystem) ? 
" hipEventReleaseToSystem" : "") @@ -185,6 +187,21 @@ int main(){ } + // Check defaults: + if (1) { + int *A = nullptr; + HIPCHECK(hipHostMalloc((void**)&A, sizeBytes)); + const char *ptrType = "default"; + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + + CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType); + CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType); + } + + } diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp new file mode 100644 index 0000000000..459c0054c9 --- /dev/null +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -0,0 +1,170 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +// Simple test for memset. +// Also serves as a template for other tests. + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * RUN: %t + * HIT_END + */ + +#include "hip/hip_runtime.h" +#include "test_common.h" + +#ifdef __HIP_PLATFORM_HCC__ +#include +#endif + +#define USE_HSA_COPY 1 + +int enablePeers(int dev0, int dev1) +{ + int canAccessPeer01, canAccessPeer10; + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer01, dev0, dev1)); + HIPCHECK(hipDeviceCanAccessPeer(&canAccessPeer10, dev1, dev0)); + if (!canAccessPeer01 || !canAccessPeer10) { + return -1; + } + + HIPCHECK(hipSetDevice(dev0)); + HIPCHECK(hipDeviceEnablePeerAccess(dev1, 0/*flags*/)); + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipDeviceEnablePeerAccess(dev0, 0/*flags*/)); + + return 0; +}; + + +__global__ void +memsetIntKernel(int * ptr, int val, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + if (gid < numElements) { + ptr[gid] = val; + } +}; + + +void checkReverse(const int *ptr, int numElements, int expected) { + for (int i=numElements-1; i>=0; i--) { + if (ptr[i] != expected) { + printf ("i=%d, ptr[](%d) != expected (%d)\n", i, ptr[i], expected); + assert (ptr[i] == expected); + } + } + + printf ("test: OK\n"); +} + + +void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + int * dataGpu0, int *dataGpu1, int *dataHost, int expected) +{ + hipEvent_t e; + HIPCHECK(hipEventCreateWithFlags(&e,0)); + + printf ("test: runTest with %s\n", stepAIsCopy ? 
"copy" : "kernel"); + const size_t sizeElements = numElements * sizeof(int); + + hipStream_t stepAStream = gpu0Stream; + + if (stepAIsCopy) { +#ifdef USE_HSA_COPY + HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); +#endif + } else { + assert(0); // not yet supported. + } + + HIPCHECK(hipEventRecord(e, stepAStream)); + HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + + HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream)); + + HIPCHECK(hipStreamSynchronize(gpu1Stream)); + + checkReverse(dataHost, numElements, expected); +} + + +void testMultiGpu0(int dev0, int dev1, int numElements) +{ + const size_t sizeElements = numElements * sizeof(int); + + int * dataGpu0, *dataGpu1, *dataHost; + hipStream_t gpu0Stream, gpu1Stream; + const int expected = 42; + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + + HIPCHECK(hipSetDevice(dev0)); + + HIPCHECK(hipMalloc(&dataGpu0, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu0Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu0, expected, numElements); + HIPCHECK(hipDeviceSynchronize()); + + + HIPCHECK(hipSetDevice(dev1)); + HIPCHECK(hipMalloc(&dataGpu1, sizeElements)); + HIPCHECK(hipStreamCreate(&gpu1Stream)); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu1, 0x34, numElements); + HIPCHECK(hipDeviceSynchronize()); + + HIPCHECK(hipHostMalloc(&dataHost, sizeElements)); + memset(dataHost, 13, sizeElements); + +#ifdef __HIP_PLATFORM_HCC__ + hc::am_memtracker_print(0x0); +#endif + + printf (" test: init complete\n"); + + runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected); + +}; + + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true); + + int numElements = N; + + int dev0 = 0; + int dev1 = 1; + + // TODO - only 
works on multi-GPU system: + if (enablePeers(dev0,dev1) == -1) { + printf ("warning : could not find peer gpus\n"); + return -1; + }; + + //testMultiGpu0(dev0, dev1, numElements); + + + + passed(); +}; From be21cd1a919342de02ced4e7eb1666825a46f6dd Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Sun, 4 Jun 2017 20:18:37 -0500 Subject: [PATCH 129/171] Update tests. Fix some NVCC issues. Add hipStreamSync2, record_event tests. [ROCm/clr commit: 863b7c3f5688059f82c4ade18980a3567d2328dd] --- .../tests/src/kernel/inline_asm_vadd.cpp | 2 +- .../src/runtimeApi/event/record_event.cpp | 149 +++++++++++++++ .../src/runtimeApi/memory/hipHostMalloc.cpp | 3 +- .../runtimeApi/memory/hipMemoryAllocate.cpp | 132 -------------- .../src/runtimeApi/stream/hipStreamSync2.cpp | 169 ++++++++++++++++++ projects/clr/hipamd/tests/src/test_common.h | 14 ++ 6 files changed, 335 insertions(+), 134 deletions(-) create mode 100644 projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp delete mode 100644 projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp create mode 100644 projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp diff --git a/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp b/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp index 481b606e89..7a941d31af 100644 --- a/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp +++ b/projects/clr/hipamd/tests/src/kernel/inline_asm_vadd.cpp @@ -16,7 +16,7 @@ IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTI THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
*/ /* HIT_START - * BUILD: %t %s + * BUILD: %t %s EXCLUDE_HIP_PLATFORM nvcc * RUN: %t * HIT_END */ diff --git a/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp b/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp new file mode 100644 index 0000000000..66027b1643 --- /dev/null +++ b/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp @@ -0,0 +1,149 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +{ + printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i0.0f); + printf ("time=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, stop, start)); + assert (t<0.0f); + printf ("negtime=%6.2f\n", t); + + HIPCHECK(hipEventElapsedTime(&t, start, start)); + assert (t==0.0f); + HIPCHECK(hipEventElapsedTime(&t, stop, stop)); + assert (t==0.0f); + + + if (stream) { + HIPCHECK(hipStreamDestroy(stream)); + } + HIPCHECK(hipEventDestroy(start)); + HIPCHECK(hipEventDestroy(stop)); + + printf ("test: OK \n"); +} + + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("test: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone); + test (C_d, C_h, numElements, syncNullStream); + test (C_d, C_h, numElements, syncOtherStream); + //test (C_d, C_h, numElements, syncDevice); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(4000000); + + passed(); +} diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp 
b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 47baf5c206..607e2a9f63 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -21,11 +21,12 @@ */ /* HIT_START - * BUILD: %t %s ../../test_common.cpp + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS -std=c++11 * RUN: %t * HIT_END */ +#include #include"test_common.h" #define LEN 1024*1024 diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp deleted file mode 100644 index 34951f0a09..0000000000 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocate.cpp +++ /dev/null @@ -1,132 +0,0 @@ -/* -Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR -IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -/* HIT_START - * BUILD: %t %s ../../test_common.cpp - * RUN: %t - * HIT_END - */ - -#include"test_common.h" - -#define NUM_ELEMENTS 1024*1024*64 -#define SIZE NUM_ELEMENTS*sizeof(int) - -int p_count = 4; - - -void multiGpuHostAlloc(int allocDevice) -{ - - int numDevices; - HIPCHECK(hipGetDeviceCount(&numDevices)); - - printf ("info: trying multiGpuHostAlloc with allocDevice=%d numDevices=%d\n", allocDevice, numDevices); - - - HIPCHECK(hipSetDevice(allocDevice)); - - int *Ah, *Ch; - hipHostMalloc((void**)&Ah, SIZE); - hipHostMalloc((void**)&Ch, SIZE); - - const int init = -1; - for (size_t i=0; i 1) - { - multiGpuHostAlloc(1); - } - } - - passed(); -} diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp new file mode 100644 index 0000000000..b57e120dcc --- /dev/null +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -0,0 +1,169 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + + +#include "test_common.h" + +enum SyncMode { + syncNone, + syncNullStream, + syncOtherStream, + syncMarkerThenOtherStream, + syncMarkerThenOtherNonBlockingStream, + syncDevice +}; + + +const char *syncModeString(int syncMode) { + switch (syncMode) { + case syncNone: + return "syncNone"; + case syncNullStream: + return "syncNullStream"; + case syncOtherStream: + return "syncOtherStream"; + case syncMarkerThenOtherStream: + return "syncMarkerThenOtherStream"; + case syncMarkerThenOtherNonBlockingStream: + return "syncMarkerThenOtherNonBlockingStream"; + case syncDevice: + return "syncDevice"; + default: + return "unknown"; + }; +}; + + +void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +{ + printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + size_t sizeBytes = numElements * sizeof(int); + + int count =100; + int init0 = 0; + HIPCHECK(hipMemset(C_d, init0, sizeBytes)); + for (int i=0; i 0); + } + + + HIPCHECK(hipStreamDestroy(otherStream)); + HIPCHECK(hipEventDestroy(e)); + + printf ("test: OK - %d mismatches (%6.2f%%)\n", mismatches, ((double)(mismatches)*100.0)/numElements); +} + + +void testEventRecord() +{ +} + + +void runTests(int64_t numElements) +{ + size_t sizeBytes = numElements * sizeof(int); + + printf ("\n\ntest: starting sequence with sizeBytes=%zu bytes, %6.2f MB\n", sizeBytes, sizeBytes/1024.0/1024.0); + + + int *C_h, *C_d; + HIPCHECK(hipMalloc(&C_d, sizeBytes)); + HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + + + { + test (C_d, C_h, numElements, syncNone, true /*expectMismatch*/); + test (C_d, C_h, numElements, 
syncNullStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncOtherStream, true /*expectMismatch*/); + test (C_d, C_h, numElements, syncDevice, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherStream, false /*expectMismatch*/); + test (C_d, C_h, numElements, syncMarkerThenOtherNonBlockingStream, true /*expectMismatch*/); + } + + + HIPCHECK(hipFree(C_d)); + HIPCHECK(hipHostFree(C_h)); +} + + +int main(int argc, char *argv[]) +{ + HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); + + runTests(40000000); + + passed(); +} diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 81edca4e1e..f585fb8bca 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -201,6 +201,20 @@ addCountReverse( const T *A_d, } +template +__global__ void +memsetReverse( T *C_d, T val, + int64_t NELEM) +{ + size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + size_t stride = hipBlockDim_x * hipGridDim_x ; + + for (int64_t i=NELEM-stride+offset; i>=0; i-=stride) { + C_d[i] = val; + } +} + + template void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { From 7237ed04f324f8a1a0f637a4c56cd1a4d1934fcf Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Mon, 5 Jun 2017 00:41:18 -0500 Subject: [PATCH 130/171] Fix HIP_SYNC_NULL_STREAM=0 mode. - Fix null-stream sync - hipStreamDestroy of null stream returns hipErrorInvalidResourceHandle - Update documentation. - Add tests for null stream sync, hipEventElapsedTime. 
- Rename internal enum hipEventStatusRecorded to hipEventStatusComplete - refactor hipStreamWaitEvent to streamline control-flow [ROCm/clr commit: 823281dcbae38bb74d033078aa0f8da256367bff] --- .../include/hip/hcc_detail/hip_runtime_api.h | 10 +- projects/clr/hipamd/src/hip_event.cpp | 74 ++++----- projects/clr/hipamd/src/hip_hcc.cpp | 56 ++++--- projects/clr/hipamd/src/hip_hcc_internal.h | 10 +- projects/clr/hipamd/src/hip_stream.cpp | 30 ++-- .../src/runtimeApi/event/record_event.cpp | 141 ++++++++++++------ .../src/runtimeApi/stream/hipStreamSync2.cpp | 72 ++++++--- 7 files changed, 249 insertions(+), 144 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index 25eac31ec6..fde38c8395 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -658,10 +658,12 @@ hipError_t hipStreamSynchronize(hipStream_t stream); * * This function inserts a wait operation into the specified stream. * All future work submitted to @p stream will wait until @p event reports completion before beginning execution. - * This function is host-asynchronous and the function may return before the wait has completed. + * + * This function only waits for commands in the current stream to complete. Notably, this function does + * not implicitly wait for commands in the default stream to complete, even if the specified stream is + * created with hipStreamNonBlocking = 0. * * @see hipStreamCreate, hipStreamCreateWithFlags, hipStreamSynchronize, hipStreamDestroy - * */ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags); @@ -766,10 +768,10 @@ hipError_t hipEventCreate(hipEvent_t* event); * the specified stream, after all previous * commands in that stream have completed executing. 
* - * If hipEventRecord() has been previously called aon event, then this call will overwrite any existing state in event. + * If hipEventRecord() has been previously called on this event, then this call will overwrite any existing state in event. * * If this function is called on a an event that is currently being recorded, results are undefined - either - * outstanding recording may save state into the event, and the order is not guaranteed. This shoul be avoided. + * outstanding recording may save state into the event, and the order is not guaranteed. * * @see hipEventCreate, hipEventCreateWithFlags, hipEventQuery, hipEventSynchronize, hipEventDestroy, hipEventElapsedTime * diff --git a/projects/clr/hipamd/src/hip_event.cpp b/projects/clr/hipamd/src/hip_event.cpp index 8ef652489a..ab1c43a00b 100644 --- a/projects/clr/hipamd/src/hip_event.cpp +++ b/projects/clr/hipamd/src/hip_event.cpp @@ -53,15 +53,12 @@ void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, -void ihipEvent_t::setTimestamp() +void ihipEvent_t::refereshEventStatus() { bool isReady0 = _marker.is_ready(); bool isReady1; int val = 0; - if (_state == hipEventStatusRecorded) { - // already recorded, done: - return; - } else { + if (_state == hipEventStatusRecording) { // TODO - use completion-future functions to obtain ticks and timestamps: hsa_signal_t *sig = static_cast (_marker.get_native_handle()); isReady1 = _marker.is_ready(); @@ -78,12 +75,12 @@ void ihipEvent_t::setTimestamp() _timestamp = 0; } - _state = hipEventStatusRecorded; + _state = hipEventStatusComplete; } } } - if (_state != hipEventStatusRecorded) { + if (_state != hipEventStatusComplete) { //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); } } @@ -103,12 +100,10 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) const unsigned releaseFlags = (hipEventReleaseToDevice | hipEventReleaseToSystem); const bool illegalFlags = (flags & ~supportedFlags) || // can't set any 
unsupported flags. - (flags & releaseFlags) == releaseFlags; // can't set both + (flags & releaseFlags) == releaseFlags; // can't set both release flags if (!illegalFlags) { - ihipEvent_t *eh = new ihipEvent_t(flags); - - *event = eh; + *event = new ihipEvent_t(flags); } else { e = hipErrorInvalidValue; } @@ -148,7 +143,7 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) ctx->locked_syncDefaultStream(true, true); event->_timestamp = hc::get_system_ticks(); - event->_state = hipEventStatusRecorded; + event->_state = hipEventStatusComplete; return ihipLogStatus(hipSuccess); } else { event->_state = hipEventStatusRecording; @@ -209,41 +204,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) { HIP_INIT_API(ms, start, stop); - start->setTimestamp(); - stop->setTimestamp(); - hipError_t status = hipSuccess; + *ms = 0.0f; - if (start && stop) { - // refresh status: - if ((start->_state == hipEventStatusRecorded) && (stop->_state == hipEventStatusRecorded)) { - // Common case, we have good information for both events. + if ((start == nullptr) || + (start->_flags & hipEventDisableTiming) || + (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) || + (stop == nullptr) || + (stop->_flags & hipEventDisableTiming) || + ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) { - int64_t tickDiff = (stop->timestamp() - start->timestamp()); + // Both events must be at least recorded else return hipErrorInvalidResourceHandle - uint64_t freqHz; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); - if (freqHz) { - *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; - status = hipSuccess; - } else { - * ms = 0.0f; - status = hipErrorInvalidValue; - } + status = hipErrorInvalidResourceHandle; + + } else { + // Refresh status, if still recording... 
+ start->refereshEventStatus(); + stop->refereshEventStatus(); + + if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) { + // Common case, we have good information for both events. + + int64_t tickDiff = (stop->timestamp() - start->timestamp()); + + uint64_t freqHz; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); + if (freqHz) { + *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; + status = hipSuccess; + } else { + * ms = 0.0f; + status = hipErrorInvalidValue; + } } else if ((start->_state == hipEventStatusRecording) || (stop->_state == hipEventStatusRecording)) { status = hipErrorNotReady; - } else if ((start->_state == hipEventStatusUnitialized) || - (stop->_state == hipEventStatusUnitialized)) { - status = hipErrorInvalidResourceHandle; + } else { + assert(0); } - } else { - status = hipErrorInvalidResourceHandle; - } + } return ihipLogStatus(status); } diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 5d8846da1e..0cdc57eaab 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -92,7 +92,8 @@ int HIP_COHERENT_HOST_ALLOC = 0; // USE_ HIP_SYNC_HOST_ALLOC int HIP_SYNC_HOST_ALLOC = 1; -// Sync on host between +// Chicken bit to sync on host to implement null stream. +// If 0, null stream synchronization is performed on the GPU int HIP_SYNC_NULL_STREAM = 1; // HIP needs to change some behavior based on HCC_OPT_FLUSH : @@ -987,11 +988,17 @@ std::string ihipCtx_t::toString() const -// Implement "default" stream syncronization -// This waits for all other streams to drain before continuing. +// This is called for submissions that are sent to the null/default stream. This routine ensures +// that this new command waits for activity in the other streams to complete before proceeding. 
+// +// HIP_SYNC_NULL_STREAM=0 does all dependency resolution on the GPU +// HIP_SYNC_NULL_STREAM=1 is the legacy non-optimal mode which conservatively waits on host. +// // If waitOnSelf is set, this additionally waits for the default stream to empty. // In new HIP_SYNC_NULL_STREAM=0 mode, this enqueues a marker which causes the default stream to wait for other // activity, but doesn't actually block the host. If host blocking is desired, the caller should set syncHost. +// +// syncHost causes host to wait for the stream to finish. // Note HIP_SYNC_NULL_STREAM=1 path always sync to Host. void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) { @@ -1005,34 +1012,36 @@ void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf, bool syncHost) for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) { ihipStream_t *stream = *streamI; + // Don't wait for streams that have "opted-out" of syncing with NULL stream. + // And - don't wait for the NULL stream, unless waitOnSelf specified. + bool waitThisStream = (!(stream->_flags & hipStreamNonBlocking)) && + (waitOnSelf || (stream != _defaultStream)); + if (HIP_SYNC_NULL_STREAM) { - // Don't wait for streams that have "opted-out" of syncing with NULL stream. 
- // And - don't wait for the NULL stream - if (!(stream->_flags & hipStreamNonBlocking)) { - - if (waitOnSelf || (stream != _defaultStream)) { - stream->locked_wait(); - } + if (waitThisStream) { + stream->locked_wait(); } } else { - if (!(stream->_flags & hipStreamNonBlocking) && (stream != _defaultStream)) { + if (waitThisStream) { LockedAccessor_StreamCrit_t streamCrit(stream->_criticalData); // The last marker will provide appropriate visibility: if (!streamCrit->_av.get_is_empty()) { depOps.push_back(streamCrit->_av.create_marker(hc::accelerator_scope)); + tprintf(DB_SYNC, " push marker to wait for stream=%s\n", ToString(stream).c_str()); + } else { + tprintf(DB_SYNC, " skipped stream=%s since it is empty\n", ToString(stream).c_str()); } } } } - // Enqueue a barrier to wait on all the barriers we sent above: if (!HIP_SYNC_NULL_STREAM && !depOps.empty()) { LockedAccessor_StreamCrit_t defaultStreamCrit(_defaultStream->_criticalData); - tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams\n", depOps.size()); + tprintf(DB_SYNC, " null-stream wait on %zu non-empty streams. sync_host=%d\n", depOps.size(), syncHost); hc::completion_future defaultCf = defaultStreamCrit->_av.create_blocking_marker(depOps.begin(), depOps.end(), hc::accelerator_scope); if (syncHost) { defaultCf.wait(); // TODO - account for active or blocking here. 
@@ -1374,6 +1383,7 @@ void ihipInit() hipStream_t ihipSyncAndResolveStream(hipStream_t stream) { if (stream == hipStreamNull ) { + // Submitting to NULL stream, call locked_syncDefaultStream to wait for all other streams: ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); @@ -1382,34 +1392,38 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream) #endif return ctx->_defaultStream; } else { - // All streams have to wait for legacy default stream to be empty: + // Submitting to a "normal" stream, just wait for null stream: if (!(stream->_flags & hipStreamNonBlocking)) { if (HIP_SYNC_NULL_STREAM) { - tprintf(DB_SYNC, "ihipSyncAndResolveStream %s wait on default stream\n", ToString(stream).c_str()); + tprintf(DB_SYNC, "ihipSyncAndResolveStream %s host-wait on default stream\n", ToString(stream).c_str()); stream->getCtx()->_defaultStream->locked_wait(); } else { ihipStream_t *defaultStream = stream->getCtx()->_defaultStream; - tprintf(DB_SYNC, "%s marker wait default stream\n", ToString(stream).c_str()); - bool needMarker = false; + bool needGatherMarker = false; // used to gather together other markers. hc::completion_future dcf; { LockedAccessor_StreamCrit_t defaultStreamCrit(defaultStream->criticalData()); - // TODO - could call create_blocking_marker(queue) + // TODO - could call create_blocking_marker(queue) or uses existing marker. if (!defaultStreamCrit->_av.get_is_empty()) { - needMarker = true; + needGatherMarker = true; - // TODO - add "none_scope". 
+ tprintf(DB_SYNC, " %s adding marker to default %s for dependency\n", + ToString(stream).c_str(), ToString(defaultStream).c_str()); dcf = defaultStreamCrit->_av.create_marker(hc::accelerator_scope); + } else { + tprintf(DB_SYNC, " %s skipping marker since default stream is empty\n", ToString(stream).c_str()); } } - if (needMarker) { + if (needGatherMarker) { // ensure any commands sent to this stream wait on the NULL stream before continuing LockedAccessor_StreamCrit_t thisStreamCrit(stream->criticalData()); // TODO - could be "noret" version of create_blocking_marker thisStreamCrit->_av.create_blocking_marker(dcf, hc::accelerator_scope); + tprintf(DB_SYNC, " %s adding marker to wait for freshly recorded default-stream marker \n", + ToString(stream).c_str()); } } } diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index b15d5a73e4..c3f8b72311 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -586,10 +586,10 @@ private: // Data //---- // Internal event structure: enum hipEventStatus_t { - hipEventStatusUnitialized = 0, // event is unutilized, must be "Created" before use. - hipEventStatusCreated = 1, - hipEventStatusRecording = 2, // event has been enqueued to record something. - hipEventStatusRecorded = 3, // event has been recorded - timestamps are valid. + hipEventStatusUnitialized = 0, // event is uninitialized, must be "Created" before use. + hipEventStatusCreated = 1, // event created, but not yet Recorded + hipEventStatusRecording = 2, // event has been recorded into a stream but not completed yet. + hipEventStatusComplete = 3, // event has been recorded - timestamps are valid. 
} ; // TODO - rename to ihip type of some kind @@ -604,7 +604,7 @@ class ihipEvent_t { public: ihipEvent_t(unsigned flags); void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); - void setTimestamp(); + void refereshEventStatus(); uint64_t timestamp() const { return _timestamp; } ; ihipEventType_t type() const { return _type; }; diff --git a/projects/clr/hipamd/src/hip_stream.cpp b/projects/clr/hipamd/src/hip_stream.cpp index 9f1228d6f7..40aade28b9 100644 --- a/projects/clr/hipamd/src/hip_stream.cpp +++ b/projects/clr/hipamd/src/hip_stream.cpp @@ -93,20 +93,17 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int } else if (event->_state != hipEventStatusUnitialized) { - bool fastWait = false; - if (stream != hipStreamNull) { + + // This will user create_blocking_marker to wait on the specified queue. stream->locked_waitEvent(event); - fastWait = true; // don't use the slow host-side synchronization. - } - - if (!fastWait) { + } else { // TODO-hcc Convert to use create_blocking_marker(...) functionality. // Currently we have a super-conservative version of this - block on host, and drain the queue. // This should create a barrier packet in the target queue. + // TODO-HIP_SYNC_NULL_STREAM stream->locked_wait(); - e = hipSuccess; } } // else event not recorded, return immediately and don't create marker. 
@@ -150,6 +147,7 @@ hipError_t hipStreamSynchronize(hipStream_t stream) ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true/*syncToHost*/); } else { + // note this does not synchornize with the NULL stream: stream->locked_wait(); e = hipSuccess; } @@ -171,20 +169,18 @@ hipError_t hipStreamDestroy(hipStream_t stream) //--- Drain the stream: if (stream == NULL) { - ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); - ctx->locked_syncDefaultStream(true/*waitOnSelf*/, true /*syncToHost*/); + e = hipErrorInvalidResourceHandle; // TODO - review - what happens if try to destroy null stream } else { stream->locked_wait(); - e = hipSuccess; - } - ihipCtx_t *ctx = stream->getCtx(); + ihipCtx_t *ctx = stream->getCtx(); - if (ctx) { - ctx->locked_removeStream(stream); - delete stream; - } else { - e = hipErrorInvalidResourceHandle; + if (ctx) { + ctx->locked_removeStream(stream); + delete stream; + } else { + e = hipErrorInvalidResourceHandle; + } } return ihipLogStatus(e); diff --git a/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp b/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp index 66027b1643..bd8a3ada8e 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/event/record_event.cpp @@ -28,8 +28,8 @@ THE SOFTWARE. 
enum SyncMode { syncNone, - syncNullStream, - syncOtherStream, + syncStream, + syncStopEvent, }; @@ -37,19 +37,23 @@ const char *syncModeString(int syncMode) { switch (syncMode) { case syncNone: return "syncNone"; - case syncNullStream: - return "syncNullStream"; - case syncOtherStream: - return "syncOtherStream"; + case syncStream: + return "syncStream"; + case syncStopEvent: + return "syncStopEvent"; default: return "unknown"; }; }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_t stream, int waitStart, SyncMode syncMode) { - printf ("\ntest: syncMode=%s\n", syncModeString(syncMode)); + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n", + testMask, stream, waitStart, syncModeString(syncMode)); size_t sizeBytes = numElements * sizeof(int); @@ -60,55 +64,95 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode) C_h[i] = -1; // initialize } - hipStream_t stream = 0; + hipEvent_t neverCreated=0, neverRecorded, timingDisabled; + HIPCHECK(hipEventCreate(&neverRecorded)); + HIPCHECK(hipEventCreateWithFlags(&timingDisabled, hipEventDisableTiming)); - unsigned flags=0; - if (syncMode == syncOtherStream) { - HIPCHECK(hipStreamCreateWithFlags(&stream, flags)); - } - - hipEvent_t neverCreated=0; - hipEvent_t start, stop, neverRecorded; + hipEvent_t start, stop; HIPCHECK(hipEventCreate(&start)); HIPCHECK(hipEventCreate(&stop)); - HIPCHECK(hipEventCreate(&neverRecorded)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + HIPCHECK(hipEventRecord(timingDisabled, stream)); // sandwhich a kernel: HIPCHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, stream, C_d, C_h, numElements, count); HIPCHECK(hipEventRecord(stop, stream)); - HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to 
finish... + + if (waitStart) { + HIPCHECK(hipEventSynchronize(start)); + } + + + hipError_t expectedStopError = hipSuccess; + + // How to wait for the events to finish: + switch (syncMode) { + case syncNone: + expectedStopError = hipErrorNotReady; + break; + case syncStream: + HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish... + break; + case syncStopEvent: + HIPCHECK(hipEventSynchronize(stop)); + break; + default: + assert(0); + }; + float t; - HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); - HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); - - HIPCHECK(hipEventElapsedTime(&t, start, stop)); - assert (t>0.0f); - printf ("time=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, stop, start)); - assert (t<0.0f); - printf ("negtime=%6.2f\n", t); - - HIPCHECK(hipEventElapsedTime(&t, start, start)); - assert (t==0.0f); - HIPCHECK(hipEventElapsedTime(&t, stop, stop)); - assert (t==0.0f); - - - if (stream) { - HIPCHECK(hipStreamDestroy(stream)); + hipError_t e = hipEventElapsedTime(&t, start, start); + if ((e != hipSuccess) && (e != hipErrorNotReady)) { + failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e)); } + + if (e == hipSuccess) + assert (t==0.0f); + + + // stop usually ready unless we skipped the synchronization (syncNone) + HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError); + if (e == hipSuccess) + assert (t==0.0f); + + + e = hipEventElapsedTime(&t, start, stop); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == hipSuccess) + assert (t>0.0f); + printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); + + e = hipEventElapsedTime(&t, stop, start); + HIPCHECK_API(e, expectedStopError); + if (expectedStopError == 
hipSuccess) + assert (t<0.0f); + printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); + + + + { + // Check some error conditions for incomplete events: + HIPCHECK_API(hipEventElapsedTime(&t, timingDisabled, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, timingDisabled), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverCreated, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverCreated), hipErrorInvalidResourceHandle); + + HIPCHECK_API(hipEventElapsedTime(&t, neverRecorded, stop), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipEventElapsedTime(&t, start, neverRecorded), hipErrorInvalidResourceHandle); + } + HIPCHECK(hipEventDestroy(start)); HIPCHECK(hipEventDestroy(stop)); + // Clear out everything: + HIPCHECK(hipDeviceSynchronize()); + printf ("test: OK \n"); } @@ -125,15 +169,22 @@ void runTests(int64_t numElements) HIPCHECK(hipMalloc(&C_d, sizeBytes)); HIPCHECK(hipHostMalloc(&C_h, sizeBytes)); + hipStream_t stream; + HIPCHECK(hipStreamCreateWithFlags(&stream, 0x0)); - { - test (C_d, C_h, numElements, syncNone); - test (C_d, C_h, numElements, syncNullStream); - test (C_d, C_h, numElements, syncOtherStream); - //test (C_d, C_h, numElements, syncDevice); + //for (int waitStart=0; waitStart<2; waitStart++) { + for (int waitStart=1; waitStart>=0; waitStart--) { + unsigned W = waitStart ? 
0x1000:0; + test (W | 0x01, C_d, C_h, numElements, 0 , waitStart, syncNone); + test (W | 0x02, C_d, C_h, numElements, stream, waitStart, syncNone); + test (W | 0x04, C_d, C_h, numElements, 0 , waitStart, syncStream); + test (W | 0x08, C_d, C_h, numElements, stream, waitStart, syncStream); + test (W | 0x10, C_d, C_h, numElements, 0, waitStart, syncStopEvent); + test (W | 0x20, C_d, C_h, numElements, stream, waitStart, syncStopEvent); } + HIPCHECK(hipStreamDestroy(stream)); HIPCHECK(hipFree(C_d)); HIPCHECK(hipHostFree(C_h)); } @@ -143,7 +194,7 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); - runTests(4000000); + runTests(80000000); passed(); } diff --git a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp index b57e120dcc..c6a58ce7d4 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -56,9 +56,27 @@ const char *syncModeString(int syncMode) { }; -void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) +void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - printf ("\ntest: syncMode=%s expectMismatch=%d\n", syncModeString(syncMode), expectMismatch); + + // This test sends a long-running kernel to the null stream, then tests to see if the + // specified synchronization technique is effective. + // + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // cases the test sets expectMismatch and the check logic below will attempt to ensure that + // the undesired synchronization did not occur - ie ensure the kernel is still running and did + // not yet update the stop event. This can be tricky since if the kernel runs fast enough it + // may complete before the check. 
To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. + // + // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results + // are checked and we test to make sure stop event has completed. + + if (!(testMask & p_tests)) { + return; + } + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -72,13 +90,15 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec hipStream_t otherStream = 0; unsigned flags = (syncMode == syncMarkerThenOtherNonBlockingStream) ? hipStreamNonBlocking : hipStreamDefault; HIPCHECK(hipStreamCreateWithFlags(&otherStream, flags)); - hipEvent_t e; - HIPCHECK(hipEventCreate(&e)); + hipEvent_t stop, otherStreamEvent; + HIPCHECK(hipEventCreate(&stop)); + HIPCHECK(hipEventCreate(&otherStreamEvent)); unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. 
hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -92,7 +112,10 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - HIPCHECK(hipEventRecord(e, otherStream)); // this may wait for NULL stream depending hipStreamNonBlocking flag above + + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: @@ -102,6 +125,14 @@ void test(int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expec assert(0); }; + hipError_t done = hipEventQuery(stop); + + if (expectMismatch) { + assert (done == hipErrorNotReady); + } else { + assert (done == hipSuccess); + } + int mismatches = 0; int expected = init0 + count; for (int i=0; i Date: Mon, 5 Jun 2017 08:50:41 -0500 Subject: [PATCH 131/171] Enable HIP_SYNC_NULL_STREAM=0 optimization. [ROCm/clr commit: 344b6cb0c0852bbab293a0d618a59274a693f69a] --- projects/clr/hipamd/src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 0cdc57eaab..08a2cdbfcf 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -94,7 +94,7 @@ int HIP_SYNC_HOST_ALLOC = 1; // Chicken bit to sync on host to implement null stream. 
// If 0, null stream synchronization is performed on the GPU -int HIP_SYNC_NULL_STREAM = 1; +int HIP_SYNC_NULL_STREAM = 0; // HIP needs to change some behavior based on HCC_OPT_FLUSH : // TODO - set this to 1 From 450a26e5d42d5682771e2cc60010fb840f4cbaf4 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 5 Jun 2017 11:38:28 -0500 Subject: [PATCH 132/171] Improve HIP kernel names, attributes and codegen, contributed by Alex Voicu Change-Id: I2cafbdc5a98e26c7f4fad84739c915e7dc09993c [ROCm/clr commit: 3b6a863eef6775669356cd34403b98620bf24ddb] --- .../hip/hcc_detail/grid_launch_GGL.hpp | 1278 +++++++++-------- .../include/hip/hcc_detail/hip_runtime.h | 11 +- .../include/hip/hcc_detail/host_defines.h | 2 +- 3 files changed, 660 insertions(+), 631 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 2dd9a95bc6..8e3dab8482 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -89,8 +89,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, const hc::accelerator_view& acc_v, - K k, - Ts&&... args) + K k) { const auto d = hc::extent<3>{ num_blocks.z * dim_blocks.z, @@ -102,16 +101,11 @@ namespace hip_impl group_mem_bytes); try { - hc::parallel_for_each( - acc_v, - d, - [=](const hc::tiled_index<3>& idx) [[hc]] { - k(args...); - }); + hc::parallel_for_each(acc_v, d, k); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -133,8 +127,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... 
args) + K k) { void* lck_stream = nullptr; auto acc_v = lock_stream_hip_(stream, lck_stream); @@ -156,12 +149,11 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, acc_v, - std::move(k), - std::forward(args)...); + std::move(k)); } catch (std::exception& ex) { - std::cerr << "Failed in " << __FUNCTION__ << ", with exception: " - << ex.what() << std::endl; + std::cerr << "Failed in " << __func__ << ", with exception: " + << ex.what() << std::endl; throw; } } @@ -175,8 +167,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -184,9 +175,7 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -199,8 +188,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&&... args) + K k) { grid_launch_hip_impl_( New_grid_launch_tag{}, @@ -209,9 +197,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - hipLaunchParm{}, - std::forward(args)...); + std::move(k)); } template @@ -223,8 +209,7 @@ namespace hip_impl int group_mem_bytes, hipStream_t stream, const char* kernel_name, - K k, - Ts&& ... args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -233,8 +218,7 @@ namespace hip_impl group_mem_bytes, std::move(stream), kernel_name, - std::move(k), - std::forward(args)...); + std::move(k)); } template @@ -245,8 +229,7 @@ namespace hip_impl dim3 dim_blocks, int group_mem_bytes, hipStream_t stream, - K k, - Ts&& ... 
args) + K k) { grid_launch_hip_impl_( is_new_grid_launch_t{}, @@ -254,610 +237,649 @@ namespace hip_impl std::move(dim_blocks), group_mem_bytes, std::move(stream), - std::move(k), - std::forward(args)...); + std::move(k)); } - namespace - { - template - constexpr - inline - T&& forward_(std::remove_reference_t& x) [[hc]] - { - return static_cast(x); - } + // TODO: these are temporary and purposefully noisy and disruptive. + #define make_kernel_name_hip(k, n)\ + HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ + HIP_kernel_functor_name_end ## _ ## n - template - struct Forwarder { - template - void operator()(Ts&&...args) const [[hc]] - { - k(forward_(args)...); - } - }; - } + #define make_kernel_functor_hip_27(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24)\ + struct make_kernel_name_hip(function_name, 25) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ + }\ + } + #define make_kernel_functor_hip_26(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23)\ + struct make_kernel_name_hip(function_name, 24) {\ + 
std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ + }\ + } + #define make_kernel_functor_hip_25(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22)\ + struct make_kernel_name_hip(function_name, 23) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + __attribute__((used, flatten))\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_);\ + }\ + } + #define make_kernel_functor_hip_24(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, 
p21)\ + struct make_kernel_name_hip(function_name, 22) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_);\ + }\ + } + #define make_kernel_functor_hip_23(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20)\ + struct make_kernel_name_hip(function_name, 21) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_);\ + }\ + } + #define make_kernel_functor_hip_22(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19)\ + struct make_kernel_name_hip(function_name, 20) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t 
_p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_);\ + }\ + } + #define make_kernel_functor_hip_21(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18)\ + struct make_kernel_name_hip(function_name, 19) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_);\ + }\ + } + #define make_kernel_functor_hip_20(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17)\ + struct make_kernel_name_hip(function_name, 18) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + 
std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ + }\ + } + #define make_kernel_functor_hip_19(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16)\ + struct make_kernel_name_hip(function_name, 17) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ + }\ + } + #define make_kernel_functor_hip_18(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15)\ + struct make_kernel_name_hip(function_name, 16) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ + }\ + } + #define make_kernel_functor_hip_17(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14)\ + struct 
make_kernel_name_hip(function_name, 15) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_);\ + }\ + } + #define make_kernel_functor_hip_16(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13)\ + struct make_kernel_name_hip(function_name, 14) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_);\ + }\ + } + #define make_kernel_functor_hip_15(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12)\ + struct make_kernel_name_hip(function_name, 13) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_);\ + }\ + } + #define make_kernel_functor_hip_14(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, 
p8, p9,\ + p10, p11)\ + struct make_kernel_name_hip(function_name, 12) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_);\ + }\ + } + #define make_kernel_functor_hip_13(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ + struct make_kernel_name_hip(function_name, 11) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_);\ + }\ + } + #define make_kernel_functor_hip_12(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ + struct make_kernel_name_hip(function_name, 10) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ + _p9_);\ + }\ + } + #define make_kernel_functor_hip_11(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ + struct make_kernel_name_hip(function_name, 9) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + void operator()(const 
hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_);\ + }\ + } + #define make_kernel_functor_hip_10(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ + struct make_kernel_name_hip(function_name, 8) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ + }\ + } + #define make_kernel_functor_hip_9(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6)\ + struct make_kernel_name_hip(function_name, 7) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ + }\ + } + #define make_kernel_functor_hip_8(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5)\ + struct make_kernel_name_hip(function_name, 6) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ + }\ + } + #define make_kernel_functor_hip_7(\ + function_name, kernel_name, p0, p1, p2, p3, p4)\ + struct make_kernel_name_hip(function_name, 5) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ + }\ + } + #define make_kernel_functor_hip_6(function_name, kernel_name, p0, p1, p2, p3)\ + struct make_kernel_name_hip(function_name, 4) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + 
std::decay_t _p3_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_, _p3_);\ + }\ + } + #define make_kernel_functor_hip_5(function_name, kernel_name, p0, p1, p2)\ + struct make_kernel_name_hip(function_name, 3) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_, _p2_);\ + }\ + } + #define make_kernel_functor_hip_4(function_name, kernel_name, p0, p1)\ + struct make_kernel_name_hip(function_name, 2) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_, _p1_);\ + }\ + } + #define fofo(f, n) kernel_prefix_hip ## f ## kernel_suffix_hip ## n + #define make_kernel_functor_hip_3(function_name, kernel_name, p0)\ + struct make_kernel_name_hip(function_name, 1) {\ + std::decay_t _p0_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(_p0_);\ + }\ + } + #define make_kernel_functor_hip_2(function_name, kernel_name)\ + struct make_kernel_name_hip(function_name, 0) {\ + void operator()(const hc::tiled_index<3>&) [[hc]]\ + {\ + return kernel_name(hipLaunchParm{});\ + }\ + } + #define make_kernel_functor_hip_1(...) + #define make_kernel_functor_hip_0(...) + #define make_kernel_functor_hip_(...)\ + overload_macro_hip_(make_kernel_functor_hip_, __VA_ARGS__) - template - requires(Domain == {Ts...}) - inline - void grid_launch( - New_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_impl_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - Forwarder{}, - std::forward(args)...); - } - template - requires(Domain == {Ts...}) - inline - void grid_launch( - Old_grid_launch_tag, - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... 
args) - { - grid_launch_hip_( - New_grid_launch_tag{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - hipLaunchParm{}, - std::forward(args)...); - } - - template - requires(Domain == {Ts...}) - inline - std::enable_if_t::value> grid_launch_hip_( - dim3 num_blocks, - dim3 dim_blocks, - int group_mem_bytes, - hipStream_t stream, - Ts&&... args) - { - grid_launch_hip_( - is_new_grid_launch_t{}, - std::move(num_blocks), - std::move(dim_blocks), - group_mem_bytes, - std::move(stream), - std::forward(args)...); - } - - // TODO: these are temporary, they need to be completely removed once we - // enable C++14 support and can have proper generic, variadic lambdas. - #define make_kernel_lambda_hip_26(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, p23, p24)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_,\ - const std::decay_t& _p24_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_);\ - } - #define make_kernel_lambda_hip_25(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22, 
p23)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_,\ - const std::decay_t& _p23_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_, _p23_);\ - } - #define make_kernel_lambda_hip_24(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21, p22)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_,\ - const std::decay_t& _p22_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_, _p22_);\ - } - #define make_kernel_lambda_hip_23(\ - 
kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20, p21)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_,\ - const std::decay_t& _p21_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_, _p21_);\ - } - #define make_kernel_lambda_hip_22(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19, p20)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_,\ - const std::decay_t& _p20_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_, _p20_);\ - } - #define make_kernel_lambda_hip_21(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, 
p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18, p19)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_,\ - const std::decay_t& _p19_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_, _p19_);\ - } - #define make_kernel_lambda_hip_20(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17, p18)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_,\ - const std::decay_t& _p18_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ - _p18_);\ - } - #define make_kernel_lambda_hip_19(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16, p17)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const 
std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_,\ - const std::decay_t& _p17_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_);\ - } - #define make_kernel_lambda_hip_18(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15,\ - p16)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_,\ - const std::decay_t& _p16_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_);\ - } - #define make_kernel_lambda_hip_17(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14, p15)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_,\ - const std::decay_t& _p15_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - 
_p9_, _p10_, _p11_, _p12_, _p13_, _p14_, _p15_);\ - } - #define make_kernel_lambda_hip_16(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13, p14)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_,\ - const std::decay_t& _p14_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_, _p14_);\ - } - #define make_kernel_lambda_hip_15(\ - kernel_name,\ - p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12, p13)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_,\ - const std::decay_t& _p13_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, _p12_, _p13_);\ - } - #define make_kernel_lambda_hip_14(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11, p12)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_,\ - const std::decay_t& _p12_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_, 
_p12_);\ - } - #define make_kernel_lambda_hip_13(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10, p11)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_,\ - const std::decay_t& _p11_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_,\ - _p9_, _p10_, _p11_);\ - } - #define make_kernel_lambda_hip_12(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, p10)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_,\ - const std::decay_t& _p10_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ - _p10_);\ - } - #define make_kernel_lambda_hip_11(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_,\ - const std::decay_t& _p9_) [[hc]] {\ - kernel_name(\ - _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_);\ - } - #define make_kernel_lambda_hip_10(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_,\ - const std::decay_t& _p8_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, 
_p6_, _p7_, _p8_);\ - } - #define make_kernel_lambda_hip_9(\ - kernel_name, p0, p1, p2, p3, p4, p5, p6, p7)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_,\ - const std::decay_t& _p7_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_);\ - } - #define make_kernel_lambda_hip_8(kernel_name, p0, p1, p2, p3, p4, p5, p6)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_,\ - const std::decay_t& _p6_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_);\ - } - #define make_kernel_lambda_hip_7(kernel_name, p0, p1, p2, p3, p4, p5)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_,\ - const std::decay_t& _p5_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_, _p5_);\ - } - #define make_kernel_lambda_hip_6(kernel_name, p0, p1, p2, p3, p4)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_,\ - const std::decay_t& _p4_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_, _p4_);\ - } - #define make_kernel_lambda_hip_5(kernel_name, p0, p1, p2, p3)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_,\ - const std::decay_t& _p3_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_, _p3_);\ - } - #define make_kernel_lambda_hip_4(kernel_name, p0, p1, p2)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_,\ - const std::decay_t& _p2_) [[hc]] {\ - kernel_name(_p0_, _p1_, _p2_);\ - } - #define make_kernel_lambda_hip_3(kernel_name, p0, p1)\ - [](const std::decay_t& _p0_,\ - const std::decay_t& _p1_) [[hc]] {\ - kernel_name(_p0_, _p1_);\ - } - #define 
make_kernel_lambda_hip_2(kernel_name, p0)\ - [](const std::decay_t& _p0_) [[hc]] {\ - kernel_name(_p0_);\ - } - #define make_kernel_lambda_hip_1(kernel_name)\ - []() [[hc]] { return kernel_name(hipLaunchParm{}); } - - #define make_kernel_lambda_hip_(...)\ - overload_macro_hip_(make_kernel_lambda_hip_, __VA_ARGS__) + #define hipLaunchNamedKernelGGL(\ + function_name,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ...)\ + do {\ + make_kernel_functor_hip_(function_name, kernel_name, __VA_ARGS__)\ + hip_kernel_functor_impl_{__VA_ARGS__};\ + hip_impl::grid_launch_hip_(\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + #kernel_name,\ + hip_kernel_functor_impl_);\ + } while(0) #define hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hip_impl::grid_launch_hip_(\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - #kernel_name,\ - make_kernel_lambda_hip_(kernel_name, __VA_ARGS__),\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchNamedKernelGGL(\ + unnamed,\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + ##__VA_ARGS__);\ + } while (0) #define hipLaunchKernel(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - ...)\ - do {\ - hipLaunchKernelGGL(\ - kernel_name,\ - num_blocks,\ - dim_blocks,\ - group_mem_bytes,\ - stream,\ - hipLaunchParm{},\ - ##__VA_ARGS__);\ - } while(0) + kernel_name, num_blocks, dim_blocks, group_mem_bytes, stream, ...)\ + do {\ + hipLaunchKernelGGL(\ + kernel_name,\ + num_blocks,\ + dim_blocks,\ + group_mem_bytes,\ + stream,\ + hipLaunchParm{},\ + ##__VA_ARGS__);\ + } while(0) } #endif //GENERIC_GRID_LAUNCH diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index 4d8876d8f4..129020d9cd 100644 --- 
a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -149,8 +149,15 @@ extern int HIP_TRACE_API; #endif /* Device feature flags */ -//TODO-HCC this is currently ignored by HCC target of HIP -#define __launch_bounds__(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) +#define launch_bounds_impl0(requiredMaxThreadsPerBlock)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock))) +#define launch_bounds_impl1(\ + requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor)\ + __attribute__((amdgpu_flat_work_group_size(1, requiredMaxThreadsPerBlock),\ + amdgpu_waves_per_eu(minBlocksPerMultiprocessor))) +#define select_impl_(_1, _2, impl_, ...) impl_ +#define __launch_bounds__(...) select_impl_(\ + __VA_ARGS__, launch_bounds_impl1, launch_bounds_impl0)(__VA_ARGS__) // Detect if we are compiling C++ mode or C mode #if defined(__cplusplus) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h index 5864cfa0e7..140cbb0678 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h @@ -48,7 +48,7 @@ THE SOFTWARE. #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else //#warning "GGL global define reached" -#define __global__ __attribute__((hc, weak)) +#define __global__ __attribute__((annotate("hip__global__"), hc, used)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From daae691cdb331fbe162499efd4574db00829236f Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 7 Jun 2017 00:15:05 -0500 Subject: [PATCH 133/171] Enable HCC_OPT_FLUSH=1. 
Requires appropriate HCC with this support : commit 38e392b517a46a09a3b1c8f388e6a0db3741c510 [ROCm/clr commit: c2baa4f6e604d23c14e66eb6402fe16bd6e25dab] --- projects/clr/hipamd/src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 08a2cdbfcf..d826a0cec3 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -98,7 +98,7 @@ int HIP_SYNC_NULL_STREAM = 0; // HIP needs to change some behavior based on HCC_OPT_FLUSH : // TODO - set this to 1 -int HCC_OPT_FLUSH = 0; +int HCC_OPT_FLUSH = 1; From e23be58d91a2c3f22e2eaf42308d0a739e77b03a Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:23:37 +0530 Subject: [PATCH 134/171] p2p_copy_coherency test: gracefully handle single gpu case Change-Id: I216663f67ef58c673136332635dab8b57079b909 [ROCm/clr commit: a7dc938ec0f2c0ed04a900128b9a735127662446] --- .../tests/src/runtimeApi/memory/p2p_copy_coherency.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 459c0054c9..6bc6235454 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -156,7 +156,13 @@ int main(int argc, char *argv[]) int dev0 = 0; int dev1 = 1; - // TODO - only works on multi-GPU system: + int numDevices; + HIPCHECK(hipGetDeviceCount(&numDevices)); + if (numDevices == 1) { + printf("warning : test requires atleast two gpus\n"); + passed(); + } + if (enablePeers(dev0,dev1) == -1) { printf ("warning : could not find peer gpus\n"); return -1; From 949fbad6e2aa45549616a74715d1c0de85f477ef Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:24:44 +0530 Subject: [PATCH 135/171] hipDeviceMemcpy test: make it 
functional on nvcc path Change-Id: Id10c79b48747ed701adbd0a233c53cd60cfa743b [ROCm/clr commit: a50f5ca0acd1faa1d2c2e4e25c7e0ad5d63a7025] --- projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp index 3843c07bb9..527df9bab1 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipDeviceMemcpy.cpp @@ -4,7 +4,7 @@ #include "../test_common.h" -#define LEN 1030 +#define LEN 1024 #define SIZE LEN << 2 /* HIT_START @@ -17,13 +17,13 @@ __global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In) { int tx = hipThreadIdx_x; - memcpy(Out + tx, In + tx, SIZE/LEN); + memcpy(Out + tx, In + tx, sizeof(uint32_t)); } __global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size) { int tx = hipThreadIdx_x; - memset(ptr + tx, val, size); + memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN))); } int main() From 57bfd56a0d9fc7939ac41b9b876fb655f7a4c783 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:25:54 +0530 Subject: [PATCH 136/171] hipMemcpy-size test: reduce max size to make it work correctly on nvcc path Change-Id: I9ce9f5a9e141ffd8ddf961269010b33358e02771 [ROCm/clr commit: ff8ade59aac19017d57272829335a3d701a418ab] --- projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index 749ec0de77..e8e803e44c 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -304,7 +304,7 @@ void memcpytest2_sizes(size_t maxElem=0) HIPCHECK(hipMemGetInfo(&free, &total)); if (maxElem == 0) { - maxElem = 
free/sizeof(T)/5; + maxElem = free/sizeof(T)/8; } printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", From aeb85a76ef5d5337936711519ed70d60f1409ab9 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 7 Jun 2017 15:50:28 +0530 Subject: [PATCH 137/171] hip_hcc package: add libstdc++-static as a rpm dependency Change-Id: I83a79353492a6be3d788b7c0ce4a8f3aa740d9d9 [ROCm/clr commit: ff4fae7d204aaa2c30f4e0a35e9ba7da2f2b7c32] --- projects/clr/hipamd/packaging/hip_hcc.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/packaging/hip_hcc.txt b/projects/clr/hipamd/packaging/hip_hcc.txt index b0808aa0bc..284d97e2e5 100644 --- a/projects/clr/hipamd/packaging/hip_hcc.txt +++ b/projects/clr/hipamd/packaging/hip_hcc.txt @@ -42,9 +42,9 @@ set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, libstdc++-static") else() - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, libstdc++-static") endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") From 2408d0cf7955a659558a39545aac7d3cf4a880d2 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Wed, 7 Jun 2017 09:05:30 -0500 Subject: [PATCH 138/171] Use amHostCoherentFlag. Requires new HCC version. 
[ROCm/clr commit: 9bfc7b0e139c8f84b6d29316b84f65250a077b9b] --- projects/clr/hipamd/src/hip_memory.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 3ab7713afa..c04c2611c3 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -243,6 +243,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) } + hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API((TRACE_MEM), ptr, sizeBytes, flags); @@ -289,10 +290,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; } else if (flags & hipHostMallocNonCoherent) { - amFlags = amHostPinned; + amFlags = amHostNonCoherent; } else { // depends on env variables: - amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostPinned; + amFlags = HIP_COHERENT_HOST_ALLOC ? amHostCoherent : amHostNonCoherent; } From 65ead764d3cf4009a3e75548f85461e506bc14ef Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Thu, 8 Jun 2017 19:20:10 -0500 Subject: [PATCH 139/171] Add clang version guard so the hip_fp16.h header won't be picked up by gcc Change-Id: Ia21335a455bc93210901b44bc8c76a7f4a385b55 [ROCm/clr commit: 5450021f93e71cc87312f5891985fa729668fb88] --- projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h index b1ecc61cb0..4d90ec82b2 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_fp16.h @@ -24,7 +24,7 @@ THE SOFTWARE. 
#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_FP16_H #include "hip/hcc_detail/hip_vector_types.h" - +#if ( __clang_major__ > 3) typedef __fp16 __half; typedef __fp16 __half1 __attribute__((ext_vector_type(1))); typedef __fp16 __half2 __attribute__((ext_vector_type(2))); @@ -454,6 +454,6 @@ __device__ static inline __half2 h2trunc(const __half2 h) { a.xy = __hip_hc_ir_h2trunc_int(h.xy); return a; } - +#endif //clang_major > 3 #endif From 14e51d052e30ac7c3ae7b499c9a43d46d724e8ad Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Thu, 8 Jun 2017 19:24:22 -0500 Subject: [PATCH 140/171] Fix error related to undefined reference of __get_dynamicgroupbaseptr(). Change-Id: I14951e1725e35dd5f5e53805f81cdb58661f59f2 [ROCm/clr commit: 682dda4418fef276c693f29b47ad21a6d1567088] --- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h | 8 ++++---- projects/clr/hipamd/src/device_util.cpp | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index 129020d9cd..95826f9b60 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -305,7 +305,7 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask __host__ __device__ int min(int arg1, int arg2); __host__ __device__ int max(int arg1, int arg2); -__device__ ADDRESS_SPACE_3 void* __get_dynamicgroupbaseptr(); +__device__ void* __get_dynamicgroupbaseptr(); /** @@ -464,10 +464,10 @@ do {\ // Macro to replace extern __shared__ declarations // to local variable definitions #define HIP_DYNAMIC_SHARED(type, var) \ - ADDRESS_SPACE_3 type* var = \ - (ADDRESS_SPACE_3 type*)__get_dynamicgroupbaseptr(); \ + type* var = \ + (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE ADDRESS_SPACE_3 +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/projects/clr/hipamd/src/device_util.cpp 
b/projects/clr/hipamd/src/device_util.cpp index e59a44e5ba..062372f0f4 100644 --- a/projects/clr/hipamd/src/device_util.cpp +++ b/projects/clr/hipamd/src/device_util.cpp @@ -1101,11 +1101,13 @@ __host__ __device__ int max(int arg1, int arg2) return (int)(hc::precise_math::fmax((float)arg1, (float)arg2)); } -__device__ ADDRESS_SPACE_3 void* __get_dynamicgroupbaseptr() -{ +__device__ void* __get_dynamicgroupbaseptr() { return hc::get_dynamic_group_segment_base_pointer(); } +__host__ void* __get_dynamicgroupbaseptr() { + return nullptr; +} // Precise Math Functions __device__ float __hip_precise_cosf(float x) { From cdd3846478a08df34a814fb5735239a08148a393 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 09:57:17 +0530 Subject: [PATCH 141/171] Initial implementation of hipify-cmakefile Change-Id: Id365da9f887b5c3409639f000b430d093fd4f6b3 [ROCm/clr commit: c5366a55f10b4041d16da8cd386b00cb6ccdb9fc] --- projects/clr/hipamd/bin/hipify-cmakefile | 279 +++++++++++++++++++++++ 1 file changed, 279 insertions(+) create mode 100755 projects/clr/hipamd/bin/hipify-cmakefile diff --git a/projects/clr/hipamd/bin/hipify-cmakefile b/projects/clr/hipamd/bin/hipify-cmakefile new file mode 100755 index 0000000000..b11de4adc1 --- /dev/null +++ b/projects/clr/hipamd/bin/hipify-cmakefile @@ -0,0 +1,279 @@ +#!/usr/bin/perl -w +## +# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. +## +#usage hipify-cmakefile [OPTIONS] INPUT_FILE +use Getopt::Long; + +GetOptions( + "print-stats" => \$print_stats # print the command-line, like a header. + , "quiet-warnings" => \$quiet_warnings # don't print warnings on unknown CUDA functions. + , "no-output" => \$no_output # don't write any translated output to stdout. + , "inplace" => \$inplace # modify input file inplace, save backup in ".prehip" file. + , "n" => \$n # combination of print_stats + no-output. 
+); + +$print_stats = 1 if $n; +$no_output = 1 if $n; + +@warn_whitelist = (); + +#--- +#Stats tracking code: +@statNames = ( "macro", "include", "option", "other" ); + +#--- +#Compute total of all individual counts: +sub totalStats { + my %count = %{ shift() }; + + my $total = 0; + foreach $key ( keys %count ) { + $total += $count{$key}; + } + + return $total; +} + +#--- +sub printStats { + my $label = shift(); + my @statNames = @{ shift() }; + my %counts = %{ shift() }; + my $warnings = shift(); + my $loc = shift(); + + my $total = totalStats( \%counts ); + + printf STDERR "%s %d CUDA->HIP refs( ", $label, $total; + + foreach $stat (@statNames) { + printf STDERR "%s:%d ", $stat, $counts{$stat}; + } + + printf STDERR ") warn:%d LOC:%d", $warnings, $loc; +} + +#--- +# Add adder stats to dest. Used to add stats for current file to a running total for all files: +sub addStats { + my $dest_ref = shift(); + my %adder = %{ shift() }; + + foreach $key ( keys %adder ) { + $dest_ref->{$key} += $adder{$key}; + } +} + +#--- +sub clearStats { + my $dest_ref = shift(); + my @statNames = @{ shift() }; + + foreach $stat (@statNames) { + $dest_ref->{$stat} = 0; + } +} + +# count of transforms in all files: +my %tt; +clearStats( \%tt, \@statNames ); + +my $fileCount = @ARGV; +my $fileName = ""; + +while (@ARGV) { + $fileName = shift(@ARGV); + if ($inplace) { + my $file_prehip = "$fileName" . 
".prehip"; + my $infile; + my $outfile; + if ( -e $file_prehip ) { + $infile = $file_prehip; + $outfile = $fileName; + } + else { + system("cp $fileName $file_prehip"); + $infile = $file_prehip; + $outfile = $fileName; + } + open( INFILE, "<", $infile ) or die "error: could not open $infile"; + open( OUTFILE, ">", $outfile ) or die "error: could not open $outfile"; + $OUTFILE = OUTFILE; + } + else { + open( INFILE, "<", $fileName ) or die "error: could not open $fileName"; + $OUTFILE = STDOUT; + } + + # count of transforms in this file, init to 0 here: + my %ft; + clearStats( \%ft, \@statNames ); + + my $lineCount = 0; + + undef $/; # Read whole file at once, so we can match newlines. + while () { + + # Replace find_package(CUDA) with find_package(HIP) + $ft{'include'} += s/\bfind_package[ ]*\([ ]*CUDA[ ]*[0-9.]*/find_package(HIP/ig; + + # Replace macros + $ft{'macro'} += s/\bCUDA_ADD_EXECUTABLE/HIP_ADD_EXECUTABLE/ig; + $ft{'macro'} += s/\bCUDA_ADD_LIBRARY/HIP_ADD_LIBRARY/ig; + $ft{'macro'} += s/\bCUDA_INCLUDE_DIRECTORIES/HIP_INCLUDE_DIRECTORIES/ig; + + # Replace options + $ft{'option'} += s/\bCUDA_NVCC_FLAGS/HIP_NVCC_FLAGS/ig; + $ft{'option'} += s/\bCUDA_HOST_COMPILATION_CPP/HIP_HOST_COMPILATION_CPP/ig; + $ft{'option'} += s/\bCUDA_SOURCE_PROPERTY_FORMAT/HIP_SOURCE_PROPERTY_FORMAT/ig; + + # Replace variables + $ft{'other'} += s/\bCUDA_FOUND/HIP_FOUND/ig; + $ft{'other'} += s/\bCUDA_VERSION/HIP_VERSION/ig; + $ft{'other'} += s/\bCUDA_TOOLKIT_ROOT_DIR/HIP_ROOT_DIR/ig; + + unless ($quiet_warnings) { + + #print STDERR "Check WARNINGs\n"; + # copy into array of lines, process line-by-line to show warnings: + my @lines = split /\n/, $_; + my $tmp = $_; # copies the whole file, could be a little smarter here... 
+ my $line_num = 0; + + foreach (@lines) { + $line_num++; + + # remove any whitelisted words: + foreach $w (@warn_whitelist) { + s/\b$w\b/ZAP/; + } + + $s = warnUnsupportedSpecialFunctions($line_num); + $warnings += $s; + } + + $_ = $tmp; + } + + #-------- + # Print it! + unless ($no_output) { + print $OUTFILE "$_"; + } + $lineCount = $_ =~ tr/\n//; + } + + my $totalConverted = totalStats( \%ft ); + + if ( ( $totalConverted + $warnings ) and $print_stats ) { + printStats( "info: converted", \@statNames, \%ft, $warnings, $lineCount ); + print STDERR " in '$fileName'\n"; + print STDERR "You may need to hand-edit '$fileName' to add steps to build correctly on HCC path\n"; + } + + # Update totals for all files: + addStats( \%tt, \%ft ); + $Twarnings += $warnings; + $TlineCount += $lineCount; +} + +#-- Print total stats for all files processed: +if ( $print_stats and ( $fileCount > 1 ) ) { + print STDERR "\n"; + printStats( "info: TOTAL-converted", \@statNames, \%tt, $Twarnings, $TlineCount ); + print STDERR "\n"; +} + +#--- +sub warnUnsupportedSpecialFunctions { + my $line_num = shift; + my $m = 0; + + foreach $func ( + # macros: + "CUDA_ADD_CUFFT_TO_TARGET", + "CUDA_ADD_CUBLAS_TO_TARGET", + #"CUDA_ADD_EXECUTABLE", + #"CUDA_ADD_LIBRARY", + "CUDA_BUILD_CLEAN_TARGET", + "CUDA_COMPILE", + "CUDA_COMPILE_PTX", + "CUDA_COMPILE_FATBIN", + "CUDA_COMPILE_CUBIN", + "CUDA_COMPUTE_SEPARABLE_COMPILATION_OBJECT_FILE_NAME", + #"CUDA_INCLUDE_DIRECTORIES", + "CUDA_LINK_SEPARABLE_COMPILATION_OBJECTS", + "CUDA_SELECT_NVCC_ARCH_FLAGS", + "CUDA_WRAP_SRCS", + + # options: + "CUDA_64_BIT_DEVICE_CODE", + "CUDA_ATTACH_VS_BUILD_RULE_TO_CUDA_FILE", + "CUDA_BUILD_CUBIN", + "CUDA_BUILD_EMULATION", + "CUDA_LINK_LIBRARIES_KEYWORD", + "CUDA_GENERATED_OUTPUT_DIR", + #"CUDA_HOST_COMPILATION_CPP", + "CUDA_HOST_COMPILER", + #"CUDA_NVCC_FLAGS", + #"CUDA_NVCC_FLAGS_", + "CUDA_PROPAGATE_HOST_FLAGS", + "CUDA_SEPARABLE_COMPILATION", + #"CUDA_SOURCE_PROPERTY_FORMAT", + "CUDA_USE_STATIC_CUDA_RUNTIME", + 
"CUDA_VERBOSE_BUILD", + + # others: + #"CUDA_VERSION_MAJOR", + #"CUDA_VERSION_MINOR", + #"CUDA_VERSION", + #"CUDA_VERSION_STRING", + "CUDA_HAS_FP16", + #"CUDA_TOOLKIT_ROOT_DIR", + "CUDA_SDK_ROOT_DIR", + "CUDA_INCLUDE_DIRS", + "CUDA_LIBRARIES", + "CUDA_CUFFT_LIBRARIES", + "CUDA_CUBLAS_LIBRARIES", + "CUDA_cudart_static_LIBRARY", + "CUDA_cudadevrt_LIBRARY", + "CUDA_cupti_LIBRARY", + "CUDA_curand_LIBRARY", + "CUDA_cusolver_LIBRARY", + "CUDA_cusparse_LIBRARY", + "CUDA_npp_LIBRARY", + "CUDA_nppc_LIBRARY", + "CUDA_nppi_LIBRARY", + "CUDA_npps_LIBRARY", + "CUDA_nvcuvenc_LIBRARY", + "CUDA_nvcuvid_LIBRARY" + ) + { + my $mt = m/\b($func)/g; + if ($mt) { + $m += $mt; + print STDERR " warning: $fileName:#$line_num : unsupported macro/option : $_\n"; + } + } + + return $m; +} From 2f1e4e84a2f4ad971994445af7aa67679ddab711 Mon Sep 17 00:00:00 2001 From: Patrick Flick Date: Sun, 4 Jun 2017 10:24:00 -0400 Subject: [PATCH 142/171] fix typo [ROCm/clr commit: 821c238badd67e4c5b550e8a5f10b2618e79b985] --- projects/clr/hipamd/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/README.md b/projects/clr/hipamd/README.md index d04d63714f..565fd6a36d 100644 --- a/projects/clr/hipamd/README.md +++ b/projects/clr/hipamd/README.md @@ -134,7 +134,7 @@ The README with the procedures and tips the team used during this porting effort * **bin**: Tools and scripts to help with hip porting * **hipify** : Tool to convert CUDA code to portable CPP. Converts CUDA APIs and kernel builtins. - * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc ill call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries. + * **hipcc** : Compiler driver that can be used to replace nvcc in existing CUDA code. hipcc will call nvcc or hcc depending on platform, and include appropriate platform-specific headers and libraries. 
* **hipconfig** : Print HIP configuration (HIP_PATH, HIP_PLATFORM, CXX config flags, etc) * **hipexamine.sh** : Script to scan directory, find all code, and report statistics on how much can be ported with HIP (and identify likely features not yet supported) From 4760a21b5519c475d269843887c5e63b8625f8dd Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 11:19:55 +0530 Subject: [PATCH 143/171] Update directed tests README.md Change-Id: I395245454d376508f04e5a4a62c8933895cb3867 [ROCm/clr commit: 15a346463026c6484825a663d3b439b770c0ee49] --- projects/clr/hipamd/tests/README.md | 86 +++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 22 deletions(-) diff --git a/projects/clr/hipamd/tests/README.md b/projects/clr/hipamd/tests/README.md index cb41cc10cd..27cde7c534 100644 --- a/projects/clr/hipamd/tests/README.md +++ b/projects/clr/hipamd/tests/README.md @@ -1,39 +1,78 @@ # HIP testing environment. -This document explains how to use the HIP CMAKE testing environment. +This document explains how to use the HIP CMAKE testing environment. +We make use of the HIT Integrated Tester (HIT) framework to automatically find and add test cases to the CMAKE testing environment. ### Quickstart -Usage : + +HIP unit tests are integrated into the top-level cmake project. The tests depend upon the installed version of HIP. +Typical usage (paths relative to top of the HIP repo): ``` $ mkdir build $ cd build -$ cmake ../src +$ cmake .. -DCMAKE_INSTALL_PREFIX=$PWD/install $ make +$ make install +$ make build_tests $ make test ``` ### How to add a new test -The tests/src/runtimeApi/memory/hipMemtest.cpp file contains a simple unit test and is a good starting point for other tests. -Copy this to a new test name and modify tests/src/CMakefiles.txt to add the test to the build environment. - -Recent versions of the test infrastructure use a hierarchy of folders. Each folder contains src and CMakefiles.txt file. 
-See the CMakefiles.txt files for description of the intended purpose for each sub-directory. +The test infrastructure uses a hierarchy of folders. So add the new test to the appropriate folder. +The tests/src/runtimeApi/memory/hipMemset.cpp file contains a simple unit test and is a good starting point for other tests. +Copy this to a new test name and modify it. -#### Edit CMakefiles.txt: -// Example: +### HIP Integrated Tester (HIT) + +The HIT framework automatically finds and adds test cases to the CMAKE testing environment. It achieves this by parsing all files in the tests/src folder. +The parser looks for a code block similar to the one below. ``` -# Build the test executable: -build_hip_executable (hipMemset hipMemset.cpp) - - -# This runs the tests with the specified command-line testing. -# Multiple make_test may be specified. -make_test(hipMemset " ") +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * //Small copy + * RUN: %t -N 10 --memsetval 0x42 + * // Oddball size + * RUN: %t -N 10013 --memsetval 0x5a + * // Big copy + * RUN: %t -N 256M --memsetval 0xa6 + * HIT_END + */ ``` +In the above, BUILD commands provide instructions on how to build the test case while RUN commands provide instructions on how to execute the test case. -It is recommended to place the build and run steps adjacent in the CMakefiles.txt. +#### BUILD command + +The supported syntax for the BUILD command is: +``` +BUILD: %t %s HIPCC_OPTIONS HCC_OPTIONS NVCC_OPTIONS EXCLUDE_HIP_PLATFORM +``` +%s: refers to current source file name. Additional source files needed for the test can be specified by name (including relative path). +%t: refers to target executable name derived by removing the extension from the current source file. Alternatively a target executable name can be specified. +HIPCC_OPTIONS: All options specified after this delimiter are passed to hipcc on both HCC and NVCC platforms.
+HCC_OPTIONS: All options specified after this delimiter are passed to hipcc on HCC platform only. +NVCC_OPTIONS: All options specified after this delimiter are passed to hipcc on NVCC platform only. +EXCLUDE_HIP_PLATFORM: This can be used to exclude a test case from HCC, NVCC or both platforms. + + +#### RUN command + +The supported syntax for the RUN command is: +``` +RUN: %t EXCLUDE_HIP_PLATFORM +``` +%t: refers to target executable name derived by removing the extension from the current source file. Alternatively a target executable name can be specified. +EXCLUDE_HIP_PLATFORM: This can be used to exclude a test case from HCC, NVCC or both platforms. Note that if the test has been excluded for a specific platform in the BUILD command, it is automatically excluded from the RUN command as well for the same platform. + + +#### RUN_NAMED command + +When using the RUN command, HIT will squash and append the arguments specified to the test executable name to generate the CMAKE test name. Sometimes we might want to specify a more descriptive name. The RUN_NAMED command is used for that.
The supported syntax for the RUN_NAMED command is: +``` +RUN_NAMED: %t CMAKE_TEST_NAME EXCLUDE_HIP_PLATFORM +``` ### Running tests: @@ -43,11 +82,14 @@ ctest ### Run subsets of all tests: ``` -# Run one test on the commandline (obtain commandline parms from CMakefiles.tst) -./hipMemset +# Run one test on the commandline +./directed_tests/runtime/memory/hipMemset -# Run all the memory tests: +# Run all the hipMemcpy tests: ctest -R Memcpy + +# Run all tests in a specific folder: +ctest -R memory ``` @@ -55,7 +97,7 @@ ctest -R Memcpy Find the test and commandline that fail: -(From the test build directory, perhaps hip/tests/build) +(From the build directory, perhaps hip/build) grep -IR hipMemcpy-modes -IR ../tests/ ../tests/src/runtimeApi/memory/hipMemcpy.cpp: * RUN_NAMED: %t hipMemcpy-modes --tests 0x1 From a11bc9fe4abaf2330e8ce90f02a197e405650fe5 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Mon, 12 Jun 2017 11:20:28 +0530 Subject: [PATCH 144/171] Updated RELEASE.md Change-Id: Ic451612555c66f3ed7131514fc97fcc41091370a [ROCm/clr commit: 6174e69f871cb55e5dc57ddb4241c65f35e88b59] --- projects/clr/hipamd/RELEASE.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/projects/clr/hipamd/RELEASE.md b/projects/clr/hipamd/RELEASE.md index 21fd8da7bb..5787c59881 100644 --- a/projects/clr/hipamd/RELEASE.md +++ b/projects/clr/hipamd/RELEASE.md @@ -13,6 +13,15 @@ Upcoming: ## Revision History: +=================================================================================================== +- new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent +- added support for building hipify-clang using clang 3.9 +- hipify-clang updates for CUDA 8.0 runtime+driver support +- renamed hipify to hipify-perl +- initial implementation of hipify-cmakefile +- several documentation updates & bug fixes + + =================================================================================================== Release: 1.0.17102 Date: 2017.03.07 From
942fc5de3f62e0feca06fca2999fda86cbb8f07c Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Mon, 12 Jun 2017 11:53:25 +0530 Subject: [PATCH 145/171] Update P2P test for using memcpy and kernel tests Change-Id: Ib0f8fc9425e6e85fd11d7d02395c52bc713dcb37 [ROCm/clr commit: ce6e45567790704c2a3918155371bdeee2908425] --- .../runtimeApi/memory/p2p_copy_coherency.cpp | 90 +++++++++++++------ 1 file changed, 61 insertions(+), 29 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 6bc6235454..a5d79464d0 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -33,10 +33,14 @@ THE SOFTWARE. #ifdef __HIP_PLATFORM_HCC__ #include +#define USE_HCC_MEMTRACKER 0 #endif #define USE_HSA_COPY 1 +int elementSizes[] = {16, 1024,524288}; +int nSizes = sizeof(elementSizes) / sizeof(int); + int enablePeers(int dev0, int dev1) { int canAccessPeer01, canAccessPeer10; @@ -54,16 +58,25 @@ int enablePeers(int dev0, int dev1) return 0; }; - __global__ void -memsetIntKernel(int * ptr, int val, size_t numElements) +memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElements) { int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - if (gid < numElements) { - ptr[gid] = val; + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + ptr[i] = val; } }; +__global__ void +memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numElements) +{ + int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); + int stride = hipBlockDim_x * hipGridDim_x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; void checkReverse(const int *ptr, int numElements, int expected) { for (int i=numElements-1; i>=0; i--) { @@ -76,52 +89,66 @@ void checkReverse(const int *ptr, int 
numElements, int expected) { printf ("test: OK\n"); } - -void runTest(bool stepAIsCopy, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, - int * dataGpu0, int *dataGpu1, int *dataHost, int expected) +void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + int * dataGpu0_0, int * dataGpu0_1, int *dataGpu1, int *dataHost, int expected) { hipEvent_t e; - HIPCHECK(hipEventCreateWithFlags(&e,0)); - - printf ("test: runTest with %s\n", stepAIsCopy ? "copy" : "kernel"); + if(!hostSync) { + HIPCHECK(hipEventCreateWithFlags(&e,0)); + } const size_t sizeElements = numElements * sizeof(int); + printf ("test: runTest with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled"); hipStream_t stepAStream = gpu0Stream; if (stepAIsCopy) { #ifdef USE_HSA_COPY - HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); + HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); #endif } else { - assert(0); // not yet supported. + //assert(0); // not yet supported. 
+ unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, + dataGpu0_0, dataGpu1, numElements); } - HIPCHECK(hipEventRecord(e, stepAStream)); - HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + if(!hostSync) { + HIPCHECK(hipEventRecord(e, stepAStream)); + HIPCHECK(hipStreamWaitEvent(gpu1Stream, e, 0)); + } else { + HIPCHECK(hipStreamSynchronize(stepAStream)); + } - HIPCHECK(hipMemcpyAsync(dataHost, dataGpu1, sizeElements, hipMemcpyDeviceToHost, gpu1Stream)); + HIPCHECK(hipMemcpyAsync(dataGpu0_1, dataGpu1, sizeElements, hipMemcpyDeviceToDevice, gpu1Stream)); - HIPCHECK(hipStreamSynchronize(gpu1Stream)); + if(!hostSync) { + HIPCHECK(hipEventRecord(e, gpu1Stream)); + } else { + HIPCHECK(hipStreamSynchronize(gpu1Stream)); + } + + HIPCHECK(hipMemcpyAsync(dataHost, dataGpu0_1, sizeElements, hipMemcpyDeviceToHost, gpu0Stream)); + HIPCHECK(hipStreamSynchronize(gpu0Stream)); checkReverse(dataHost, numElements, expected); } - -void testMultiGpu0(int dev0, int dev1, int numElements) +void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMemcpy) { const size_t sizeElements = numElements * sizeof(int); - int * dataGpu0, *dataGpu1, *dataHost; + int * dataGpu0_0, * dataGpu0_1, *dataGpu1, *dataHost; hipStream_t gpu0Stream, gpu1Stream; const int expected = 42; unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); HIPCHECK(hipSetDevice(dev0)); - HIPCHECK(hipMalloc(&dataGpu0, sizeElements)); + HIPCHECK(hipMalloc(&dataGpu0_0, sizeElements)); + HIPCHECK(hipMalloc(&dataGpu0_1, sizeElements)); HIPCHECK(hipStreamCreate(&gpu0Stream)); hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, - dataGpu0, expected, numElements); + dataGpu0_0, expected, numElements); HIPCHECK(hipDeviceSynchronize()); @@ -135,18 +162,19 @@ void testMultiGpu0(int dev0, int dev1, int numElements) 
HIPCHECK(hipHostMalloc(&dataHost, sizeElements)); memset(dataHost, 13, sizeElements); -#ifdef __HIP_PLATFORM_HCC__ +#if USE_HCC_MEMTRACKER hc::am_memtracker_print(0x0); #endif - + printf (" test: init complete\n"); + runTest(useMemcpy , hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); - runTest(true/*stepAIsCopy*/, gpu0Stream, gpu1Stream, numElements, dataGpu0, dataGpu1, dataHost, expected); - + HIPCHECK(hipFree(dataGpu0_0)); + HIPCHECK(hipFree(dataGpu0_1)); + HIPCHECK(hipFree(dataGpu1)); + HIPCHECK(hipHostFree(dataHost)); }; - - int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); @@ -168,8 +196,12 @@ int main(int argc, char *argv[]) return -1; }; - //testMultiGpu0(dev0, dev1, numElements); - + for(int index = 1;index < nSizes;index++) { + testMultiGpu(dev0, dev1, elementSizes[index] , false /* GPU Synchronization*/, true); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, true); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, false); + testMultiGpu(dev0, dev1, elementSizes[index] , false /*Host Synchronization*/, false); + } passed(); From f48cbf564a0bc34dff00969a11534746f4e109c1 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 13 Jun 2017 13:35:50 +0530 Subject: [PATCH 146/171] Input args NULL check in hipChooseDevice Change-Id: I1a7b8cded2f81d739645bbf3dab2f04bb9c3c796 [ROCm/clr commit: 0efd737767a56cf7d28b6a0cfbe2774cbe934e08] --- projects/clr/hipamd/src/hip_device.cpp | 110 +++++++++++++------------ 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/projects/clr/hipamd/src/hip_device.cpp b/projects/clr/hipamd/src/hip_device.cpp index 93c1c20484..05db4c2b30 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -415,72 +415,78 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop ) int inPropCount = 0; int matchedPropCount = 0; hipError_t 
e = hipSuccess; - ihipGetDeviceCount( &deviceCount ); - *device = 0; - for (int i = 0; i < deviceCount; i++) { - ihipGetDeviceProperties( &tempProp, i ); - if(prop->major != 0) { - inPropCount++; - if(tempProp.major >= prop->major) { - matchedPropCount++; - } - if(prop->minor != 0) { + if((device == NULL) || (prop == NULL)) { + e = hipErrorInvalidValue; + } + if(e == hipSuccess) { + ihipGetDeviceCount( &deviceCount ); + *device = 0; + for (int i = 0; i < deviceCount; i++) { + ihipGetDeviceProperties( &tempProp, i ); + if(prop->major != 0) { inPropCount++; - if(tempProp.minor >= prop->minor) { - matchedPropCount++; - } + if(tempProp.major >= prop->major) { + matchedPropCount++; + } + if(prop->minor != 0) { + inPropCount++; + if(tempProp.minor >= prop->minor) { + matchedPropCount++; + } + } } - } - if(prop->totalGlobalMem != 0) { - inPropCount++; - if(tempProp.totalGlobalMem >= prop->totalGlobalMem) { - matchedPropCount++; + if(prop->totalGlobalMem != 0) { + inPropCount++; + if(tempProp.totalGlobalMem >= prop->totalGlobalMem) { + matchedPropCount++; + } } - } - if(prop->sharedMemPerBlock != 0) { - inPropCount++; - if(tempProp.sharedMemPerBlock >= prop->sharedMemPerBlock) { - matchedPropCount++; + if(prop->sharedMemPerBlock != 0) { + inPropCount++; + if(tempProp.sharedMemPerBlock >= prop->sharedMemPerBlock) { + matchedPropCount++; + } } - } - if(prop->maxThreadsPerBlock != 0) { - inPropCount++; - if(tempProp.maxThreadsPerBlock >= prop->maxThreadsPerBlock ) { - matchedPropCount++; + if(prop->maxThreadsPerBlock != 0) { + inPropCount++; + if(tempProp.maxThreadsPerBlock >= prop->maxThreadsPerBlock ) { + matchedPropCount++; + } } - } - if(prop->totalConstMem != 0) { - inPropCount++; - if(tempProp.totalConstMem >= prop->totalConstMem ) { - matchedPropCount++; + if(prop->totalConstMem != 0) { + inPropCount++; + if(tempProp.totalConstMem >= prop->totalConstMem ) { + matchedPropCount++; + } } - } - if(prop->multiProcessorCount != 0) { - inPropCount++; - 
if(tempProp.multiProcessorCount >= prop->multiProcessorCount ) { - matchedPropCount++; + if(prop->multiProcessorCount != 0) { + inPropCount++; + if(tempProp.multiProcessorCount >= prop->multiProcessorCount ) { + matchedPropCount++; + } } - } - if(prop->maxThreadsPerMultiProcessor != 0) { - inPropCount++; - if(tempProp.maxThreadsPerMultiProcessor >= prop->maxThreadsPerMultiProcessor ) { - matchedPropCount++; + if(prop->maxThreadsPerMultiProcessor != 0) { + inPropCount++; + if(tempProp.maxThreadsPerMultiProcessor >= prop->maxThreadsPerMultiProcessor ) { + matchedPropCount++; + } } - } - if(prop->memoryClockRate != 0) { - inPropCount++; - if(tempProp.memoryClockRate >= prop->memoryClockRate ) { - matchedPropCount++; + if(prop->memoryClockRate != 0) { + inPropCount++; + if(tempProp.memoryClockRate >= prop->memoryClockRate ) { + matchedPropCount++; + } + } + if(inPropCount == matchedPropCount) { + *device = i; } - } - if(inPropCount == matchedPropCount) { - *device = i; - } #if 0 else{ e= hipErrorInvalidValue; } #endif + } } return ihipLogStatus(e); } + From cfe2dc9fc090c2ed071b3be5c6aec39f47356c7a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 13 Jun 2017 20:25:11 +0300 Subject: [PATCH 147/171] [HIPIFY] Initial sync HIPIFY with HIP by CUDA Driver API functions. + CUDA_Driver_API_functions_supported_by_HIP.md update. + Initial update of HIPIFY with CUDA driver API functions: 1.Error Handling, 2.Initialization, 3.Version Management, 5-6.Device Management, 7.Primary Context, 8-9.Context, 10.Module Management, 11.Memory Management. + Sync HIP functions against CUDA Driver and Runtime API functions. + Typo fixes. ToDo: 12-30 modules of CUDA Driver API. 
[ROCm/clr commit: 82b37fe48116aa6f122b0d2a2fc087fcb25f9a32] --- ...A_Driver_API_functions_supported_by_HIP.md | 123 +++++++++++-- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 169 +++++++++++------- 2 files changed, 221 insertions(+), 71 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index ad9d791a6d..d4b54438bb 100644 --- a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -386,65 +386,168 @@ | define |`CUDA_ARRAY3D_TEXTURE_GATHER` | | This flag must be set in order to perform texture gather operations on a CUDA array. | | define |`CUDA_VERSION` | | CUDA API version number. | - ## **2. Error Handling** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuGetErrorName` | | Gets the string representation of an error code enum name. | +| `cuGetErrorString` | | Gets the string description of an error code. | ## **3. Initialization** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuInit` | `hipInit` | Initialize the CUDA driver API. | ## **4. 
Version Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDriverGetVersion` | `hipDriverGetVersion` | Returns the CUDA driver version. | ## **5. Device Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| `cuDeviceGet` | `hipGetDevice` | Returns a handle to a compute device. | +| `cuDeviceGetAttribute` | `hipDeviceGetAttribute` | Returns information about the device. | +| `cuDeviceGetCount` | `hipGetDeviceCount` | Returns the number of compute-capable devices. | +| `cuDeviceGetName` | `hipDeviceGetName` | Returns an identifier string for the device. | +| `cuDeviceTotalMem` | `hipDeviceTotalMem` | Returns the total amount of memory on the device. | ## **6. Device Management [DEPRECATED]** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDeviceComputeCapability` | `hipDeviceComputeCapability` | Returns the compute capability of the device. | +| `cuDeviceGetProperties` | `hipGetDeviceProperties` | Returns properties for a selected device. | ## **7.
Primary Context Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuDevicePrimaryCtxGetState` | `hipDevicePrimaryCtxGetState` | Get the state of the primary context. | +| `cuDevicePrimaryCtxRelease` | `hipDevicePrimaryCtxRelease` | Release the primary context on the GPU. | +| `cuDevicePrimaryCtxReset` | `hipDevicePrimaryCtxReset` | Destroy all allocations and reset all state on the primary context. | +| `cuDevicePrimaryCtxRetain` | `hipDevicePrimaryCtxRetain` | Retain the primary context on the GPU. | +| `cuDevicePrimaryCtxSetFlags` | `hipDevicePrimaryCtxSetFlags` | Set flags for the primary context. | ## **8. Context Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuCtxCreate` | `hipCtxCreate` | Create a CUDA context. | +| `cuCtxDestroy` | `hipCtxDestroy` | Destroy a CUDA context. | +| `cuCtxGetApiVersion` | `hipCtxGetApiVersion` | Gets the context's API version. | +| `cuCtxGetCacheConfig` | `hipCtxGetCacheConfig` | Returns the preferred cache configuration for the current context. | +| `cuCtxGetCurrent` | `hipCtxGetCurrent` | Returns the CUDA context bound to the calling CPU thread. | +| `cuCtxGetDevice` | `hipCtxGetDevice` | Returns the device ID for the current context. | +| `cuCtxGetFlags` | `hipCtxGetFlags` | Returns the flags for the current context. | +| `cuCtxGetLimit` | | Returns resource limits. | +| `cuCtxGetSharedMemConfig` | `hipCtxGetSharedMemConfig` | Returns the current shared memory configuration for the current context. 
| +| `cuCtxGetStreamPriorityRange` | | Returns numerical values that correspond to the least and greatest stream priorities. | +| `cuCtxPopCurrent` | `hipCtxPopCurrent` | Pops the current CUDA context from the current CPU thread. | +| `cuCtxPushCurrent` | `hipCtxPushCurrent` | Pushes a context on the current CPU thread. | +| `cuCtxSetCacheConfig` | `hipCtxSetCacheConfig` | Sets the preferred cache configuration for the current context. | +| `cuCtxSetCurrent` | `hipCtxSetCurrent` | Binds the specified CUDA context to the calling CPU thread. | +| `cuCtxSetLimit` | | Set resource limits. | +| `cuCtxSetSharedMemConfig` | `hipCtxSetSharedMemConfig` | Sets the shared memory configuration for the current context. | +| `cuCtxSynchronize` | `hipCtxSynchronize` | Block for a context's tasks to complete. | ## **9. Context Management [DEPRECATED]** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuCtxAttach` | | Increment a context's usage-count. | +| `cuCtxDetach` | | Decrement a context's usage-count. | ## **10. Module Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuLinkAddData` | | Add an input to a pending linker invocation. | +| `cuLinkAddFile` | | Add a file input to a pending linker invocation. | +| `cuLinkComplete` | | Complete a pending linker invocation. | +| `cuLinkCreate` | | Creates a pending JIT linker invocation. | +| `cuLinkDestroy` | | Destroys state for a JIT linker invocation. | +| `cuModuleGetFunction` | `hipModuleGetFunction` | Returns a function handle. 
| +| `cuModuleGetGlobal` | `hipModuleGetGlobal` | Returns a global pointer from a module. | +| `cuModuleGetSurfRef` | | Returns a handle to a surface reference. | +| `cuModuleGetTexRef` | | Returns a handle to a texture reference. | +| `cuModuleLoad` | `hipModuleLoad` | Loads a compute module. | +| `cuModuleLoadData` | `hipModuleLoadData` | Load a module's data. | +| `cuModuleLoadDataEx` | `hipModuleLoadDataEx` | Load a module's data with options. | +| `cuModuleLoadFatBinary` | | Load a module's data. | +| `cuModuleUnload` | `hipModuleUnload` | Unloads a module. | ## **11. Memory Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuArray3DCreate` | | Creates a 3D CUDA array. | +| `cuArray3DGetDescriptor` | | Get a 3D CUDA array descriptor. | +| `cuArrayCreate` | | Creates a 1D or 2D CUDA array. | +| `cuArrayDestroy` | | Destroys a CUDA array. | +| `cuArrayGetDescriptor` | | Get a 1D or 2D CUDA array descriptor. | +| `cuDeviceGetByPCIBusId` | `hipDeviceGetByPCIBusId` | Returns a handle to a compute device. | +| `cuDeviceGetPCIBusId` | `hipDeviceGetPCIBusId` | Returns a PCI Bus Id string for the device. | +| `cuIpcCloseMemHandle` | | Close memory mapped with cuIpcOpenMemHandle. | +| `cuIpcGetEventHandle` | | Gets an interprocess handle for a previously allocated event. | +| `cuIpcGetMemHandle` | | Gets an interprocess memory handle for an existing device memory allocation. | +| `cuIpcOpenEventHandle` | | Opens an interprocess event handle for use in the current process. | +| `cuIpcOpenMemHandle` | | Opens an interprocess memory handle exported from another process and returns a device pointer usable in the local process. | +| `cuMemAlloc` | `hipMalloc` | Allocates device memory. 
| +| `cuMemAllocHost` | | Allocates page-locked host memory. | +| `cuMemAllocManaged` | | Allocates memory that will be automatically managed by the Unified Memory system. | +| `cuMemAllocPitch` | | Allocates pitched device memory. | +| `cuMemcpy` | | Copies memory. | +| `cuMemcpy2D` | | Copies memory for 2D arrays. | +| `cuMemcpy2DAsync` | | Copies memory for 2D arrays. | +| `cuMemcpy2DUnaligned` | | Copies memory for 2D arrays. | +| `cuMemcpy3D` | | Copies memory for 3D arrays. | +| `cuMemcpy3DAsync` | | Copies memory for 3D arrays. | +| `cuMemcpy3DPeer` | | Copies memory between contexts. | +| `cuMemcpy3DPeerAsync` | | Copies memory between contexts asynchronously. | +| `cuMemcpyAsync` | | Copies memory asynchronously. | +| `cuMemcpyAtoA` | | Copies memory from Array to Array. | +| `cuMemcpyAtoD` | | Copies memory from Array to Device. | +| `cuMemcpyAtoH` | | Copies memory from Array to Host. | +| `cuMemcpyAtoHAsync` | | Copies memory from Array to Host. | +| `cuMemcpyDtoA` | | Copies memory from Device to Array. | +| `cuMemcpyDtoD` | `hipMemcpyDtoD` | Copies memory from Device to Device. | +| `cuMemcpyDtoDAsync` | `hipMemcpyDtoDAsync` | Copies memory from Device to Device. | +| `cuMemcpyDtoH` | `hipMemcpyDtoH` | Copies memory from Device to Host. | +| `cuMemcpyDtoHAsync` | `hipMemcpyDtoHAsync` | Copies memory from Device to Host. | +| `cuMemcpyHtoA` | | Copies memory from Host to Array. | +| `cuMemcpyHtoAAsync` | | Copies memory from Host to Array. | +| `cuMemcpyHtoD` | `hipMemcpyHtoD` | Copies memory from Host to Device. | +| `cuMemcpyHtoDAsync` | `hipMemcpyHtoDAsync` | Copies memory from Host to Device. | +| `cuMemcpyPeer` | | Copies device memory between two contexts. | +| `cuMemcpyPeerAsync` | | Copies device memory between two contexts asynchronously. | +| `cuMemFree` | `hipFree` | Frees device memory. | +| `cuMemFreeHost` | `hipHostFree` | Frees page-locked host memory. | +| `cuMemGetAddressRange` | | Get information on memory allocations.
| +| `cuMemGetInfo` | `hipMemGetInfo` | Gets free and total memory. | +| `cuMemHostAlloc` | `hipHostMalloc` | Allocates page-locked host memory. | +| `cuMemHostGetDevicePointer` | | Passes back device pointer of mapped pinned memory. | +| `cuMemHostGetFlags` | | Passes back flags that were used for a pinned allocation. | +| `cuMemHostRegister` | `hipHostRegister` | Registers an existing host memory range for use by CUDA. | +| `cuMemHostUnregister` | `hipHostUnregister` | Unregisters a memory range that was registered with cuMemHostRegister. | +| `cuMemsetD16` | | Initializes device memory. | +| `cuMemsetD16Async` | | Sets device memory. | +| `cuMemsetD2D16` | | Initializes device memory. | +| `cuMemsetD2D16Async` | | Sets device memory. | +| `cuMemsetD2D32` | | Initializes device memory. | +| `cuMemsetD2D32Async` | | Sets device memory. | +| `cuMemsetD2D8` | | Initializes device memory. | +| `cuMemsetD2D8Async` | | Sets device memory. | +| `cuMemsetD32` | `hipMemset` | Initializes device memory. | +| `cuMemsetD32Async` | `hipMemsetAsync` | Sets device memory. | +| `cuMemsetD8` | | Initializes device memory. | +| `cuMemsetD8Async` | | Sets device memory. | +| `cuMipmappedArrayCreate` | | Creates a CUDA mipmapped array. | +| `cuMipmappedArrayDestroy` | | Destroys a CUDA mipmapped array. | +| `cuMipmappedArrayGetLevel` | | Gets a mipmap level of a CUDA mipmapped array. | ## **12.
Unified Addressing** diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 0825285b51..de4da78451 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -391,7 +391,7 @@ struct cuda2hipMap { cuda2hipRename["cudaErrorUnknown"] = {"hipErrorUnknown", CONV_ERR, API_RUNTIME}; // 30 ///////////////////////////// CUDA DRIVER API ///////////////////////////// - // enums + // structs cuda2hipRename["CUDA_ARRAY3D_DESCRIPTOR"] = {"HIP_ARRAY3D_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUDA_ARRAY_DESCRIPTOR"] = {"HIP_ARRAY_DESCRIPTOR", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["CUDA_MEMCPY2D"] = {"HIP_MEMCPY2D", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; @@ -453,16 +453,16 @@ struct cuda2hipMap { cuda2hipRename["CU_MEM_RANGE_ATTRIBUTE_LAST_PREFETCH_LOCATION"] = {"hipMemRangeAttributeLastPrefetchLocation", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 4 // API_RUNTIME ANALOGUE (cudaMemRangeAttributeLastPrefetchLocation = 4) // Context flags - cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 - cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 - cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 - cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 - cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 - cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, 
HIP_UNSUPPORTED}; // 0x08 - cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 - cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f + cuda2hipRename["CUctx_flags"] = {"hipCctx_flags", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["CU_CTX_SCHED_AUTO"] = {"HIP_CTX_SCHED_AUTO", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x00 + cuda2hipRename["CU_CTX_SCHED_SPIN"] = {"HIP_CTX_SCHED_SPIN", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x01 + cuda2hipRename["CU_CTX_SCHED_YIELD"] = {"HIP_CTX_SCHED_YIELD", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x02 + cuda2hipRename["CU_CTX_SCHED_BLOCKING_SYNC"] = {"HIP_CTX_SCHED_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_BLOCKING_SYNC"] = {"HIP_CTX_BLOCKING_SYNC", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x04 + cuda2hipRename["CU_CTX_SCHED_MASK"] = {"HIP_CTX_SCHED_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x07 + cuda2hipRename["CU_CTX_MAP_HOST"] = {"HIP_CTX_MAP_HOST", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x08 + cuda2hipRename["CU_CTX_LMEM_RESIZE_TO_MAX"] = {"HIP_CTX_LMEM_RESIZE_TO_MAX", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x10 + cuda2hipRename["CU_CTX_FLAGS_MASK"] = {"HIP_CTX_FLAGS_MASK", CONV_TYPE, API_DRIVER, HIP_UNSUPPORTED}; // 0x1f // Defines cuda2hipRename["CU_LAUNCH_PARAM_BUFFER_POINTER"] = {"HIP_LAUNCH_PARAM_BUFFER_POINTER", CONV_DEV, API_DRIVER}; // ((void*)0x01) @@ -882,52 +882,79 @@ struct cuda2hipMap { cuda2hipRename["CU_STREAM_MEM_OP_WRITE_VALUE_32"] = {"hipStreamBatchMemOpWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 2 cuda2hipRename["CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES"] = {"hipStreamBatchMemOpFlushRemoteWrites", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // 3 + // Error Handling + cuda2hipRename["cuGetErrorName"] = {"hipGetErrorName___", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // 
cudaGetErrorName (hipGetErrorName) has different signature + cuda2hipRename["cuGetErrorString"] = {"hipGetErrorString___", CONV_ERR, API_DRIVER, HIP_UNSUPPORTED}; // cudaGetErrorString (hipGetErrorString) has different signature + // Init cuda2hipRename["cuInit"] = {"hipInit", CONV_DRIVER, API_DRIVER}; // Driver - cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; + cuda2hipRename["cuDriverGetVersion"] = {"hipDriverGetVersion", CONV_DRIVER, API_DRIVER}; - // Context + // Context Management cuda2hipRename["cuCtxCreate_v2"] = {"hipCtxCreate", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxDestroy_v2"] = {"hipCtxDestroy", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxPopCurrent_v2"] = {"hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxPushCurrent_v2"] = {"hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetCurrent"] = {"hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetCurrent"] = {"hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetDevice"] = {"hipCtxGetDevice", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetApiVersion"] = {"hipCtxGetApiVersion", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetCacheConfig"] = {"hipCtxGetCacheConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetCacheConfig"] = {"hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSetSharedMemConfig"] = {"hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxGetSharedMemConfig"] = {"hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; - cuda2hipRename["cuCtxSynchronize"] = {"hipCtxSynchronize", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetCurrent"] = {"hipCtxGetCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetDevice"] = {"hipCtxGetDevice", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxGetFlags"] = {"hipCtxGetFlags", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetLimit"] = 
{"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxGetSharedMemConfig"] = {"hipCtxGetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxGetStreamPriorityRange"] = {"hipCtxGetStreamPriorityRange", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxPopCurrent_v2"] = {"hipCtxPopCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxPushCurrent_v2"] = {"hipCtxPushCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetCacheConfig"] = {"hipCtxSetCacheConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetCurrent"] = {"hipCtxSetCurrent", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxSetSharedMemConfig"] = {"hipCtxSetSharedMemConfig", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuCtxSynchronize"] = {"hipCtxSynchronize", CONV_CONTEXT, API_DRIVER}; + // Context Management [DEPRECATED] + cuda2hipRename["cuCtxAttach"] = {"hipCtxAttach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuCtxDetach"] = {"hipCtxDetach", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + + // Peer Context Memory Access cuda2hipRename["cuCtxEnablePeerAccess"] = {"hipCtxEnablePeerAccess", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuCtxDisablePeerAccess"] = {"hipCtxDisablePeerAccess", CONV_CONTEXT, API_DRIVER}; - // unsupported yet by HIP - cuda2hipRename["cuCtxSetLimit"] = {"hipCtxSetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuCtxGetLimit"] = {"hipCtxGetLimit", CONV_CONTEXT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + // Primary Context Management cuda2hipRename["cuDevicePrimaryCtxGetState"] = {"hipDevicePrimaryCtxGetState", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxRelease"] = {"hipDevicePrimaryCtxRelease", CONV_CONTEXT, API_DRIVER}; - 
cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxReset"] = {"hipDevicePrimaryCtxReset", CONV_CONTEXT, API_DRIVER}; + cuda2hipRename["cuDevicePrimaryCtxRetain"] = {"hipDevicePrimaryCtxRetain", CONV_CONTEXT, API_DRIVER}; cuda2hipRename["cuDevicePrimaryCtxSetFlags"] = {"hipDevicePrimaryCtxSetFlags", CONV_CONTEXT, API_DRIVER}; - // Device + // Device Management cuda2hipRename["cuDeviceGet"] = {"hipGetDevice", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetName"] = {"hipDeviceGetName", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetCount"] = {"hipGetDeviceCount", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_DRIVER}; - cuda2hipRename["cuDeviceGetProperties"] = {"hipGetDeviceProperties", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetPCIBusId"] = {"hipDeviceGetPCIBusId", CONV_DEV, API_DRIVER}; cuda2hipRename["cuDeviceGetByPCIBusId"] = {"hipDeviceGetByPCIBusId", CONV_DEV, API_DRIVER}; - cuda2hipRename["cuDeviceTotalMem_v2"] = {"hipDeviceTotalMem", CONV_DEV, API_DRIVER}; + + // Device Management [DEPRECATED] cuda2hipRename["cuDeviceComputeCapability"] = {"hipDeviceComputeCapability", CONV_DEV, API_DRIVER}; - cuda2hipRename["cuDeviceCanAccessPeer"] = {"hipDeviceCanAccessPeer", CONV_DEV, API_DRIVER}; + cuda2hipRename["cuDeviceGetProperties"] = {"hipGetDeviceProperties", CONV_DEV, API_DRIVER}; + + // Module Management + cuda2hipRename["cuLinkAddData"] = {"hipLinkAddData", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkAddFile"] = {"hipLinkAddFile", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkComplete"] = {"hipLinkComplete", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkCreate"] = {"hipLinkCreate", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuLinkDestroy"] = {"hipLinkDestroy", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + 
cuda2hipRename["cuModuleGetFunction"] = {"hipModuleGetFunction", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleGetGlobal_v2"] = {"hipModuleGetGlobal", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleGetSurfRef"] = {"hipModuleGetSurfRef", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleGetTexRef"] = {"hipModuleGetTexRef", CONV_EVENT, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; // unsupported yet by HIP [CUDA 8.0.44] // P2P Attributes @@ -960,16 +987,6 @@ struct cuda2hipMap { cuda2hipRename["cuEventRecord"] = {"hipEventRecord", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT, API_DRIVER}; - // Module - cuda2hipRename["cuModuleGetFunction"] = {"hipModuleGetFunction", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleGetGlobal_v2"] = {"hipModuleGetGlobal", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleLoad"] = {"hipModuleLoad", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleLoadData"] = {"hipModuleLoadData", CONV_MODULE, API_DRIVER}; - // unsupported yet by HIP - cuda2hipRename["cuModuleLoadDataEx"] = {"hipModuleLoadDataEx", CONV_MODULE, API_DRIVER}; - cuda2hipRename["cuModuleLoadFatBinary"] = {"hipModuleLoadFatBinary", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuModuleUnload"] = {"hipModuleUnload", CONV_MODULE, API_DRIVER}; cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams @@ -986,39 +1003,69 @@ struct cuda2hipMap { cuda2hipRename["cuStreamWaitEvent"] = {"hipStreamWaitEvent", 
CONV_STREAM, API_DRIVER}; // Memory management + cuda2hipRename["cuArray3DCreate"] = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArray3DGetDescriptor"] = {"hipArray3DGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayCreate"] = {"hipArrayCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayDestroy"] = {"hipArrayDestroy", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuArrayGetDescriptor"] = {"hipArrayGetDescriptor", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcCloseMemHandle"] = {"hipIpcCloseMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcGetEventHandle"] = {"hipIpcGetEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcGetMemHandle"] = {"hipIpcGetMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcOpenEventHandle"] = {"hipIpcOpenEventHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuIpcOpenMemHandle"] = {"hipIpcOpenMemHandle", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemAlloc_v2"] = {"hipMalloc", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; - - cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; - + cuda2hipRename["cuMemAllocHost"] = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemAllocManaged"] = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures + cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures + cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal 
to cudaMemcpy2D due to different signatures + cuda2hipRename["cuMemcpy2DAsync"] = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures + cuda2hipRename["cuMemcpy2DUnaligned"] = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures + cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures + cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures + cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures + cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures + cuda2hipRename["cuMemcpyAtoA"] = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoD"] = {"hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoH"] = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyAtoHAsync"] = {"hipMemcpyAtoHAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyDtoA"] = {"hipMemcpyDtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyDtoD_v2"] = {"hipMemcpyDtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoDAsync_v2"] = {"hipMemcpyDtoDAsync", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoH_v2"] = {"hipMemcpyDtoH", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyDtoHAsync_v2"] = {"hipMemcpyDtoHAsync", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemcpyHtoA"] = {"hipMemcpyHtoA", CONV_MEM, API_DRIVER, 
HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyHtoAAsync"] = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyHtoD_v2"] = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyHtoDAsync_v2"] = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER}; - - // unsupported yet by HIP - cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures + cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures + cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemGetAddressRange"] = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; + cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostGetDevicePointer"] = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemHostGetFlags"] = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostRegister) + cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister)
cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16Async"] = {"hipMemsetD2D16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; - // unsupported yet by HIP cuda2hipRename["cuMemsetD2D32_v2"] = {"hipMemsetD2D32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D32Async"] = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - - cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; - + cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) + cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) + cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayCreate"] = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayDestroy"] = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuMipmappedArrayGetLevel"] = {"hipMipmappedArrayGetLevel", 
CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + // unsupported yet by HIP [CUDA 8.0.44] cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) @@ -1298,7 +1345,7 @@ struct cuda2hipMap { // Attributes cuda2hipRename["cudaDeviceGetAttribute"] = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME}; - cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) + cuda2hipRename["cudaDeviceAttr"] = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME}; // API_DRIVER ANALOGUE (CUdevice_attribute) cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"] = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME}; // 1 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK = 1) cuda2hipRename["cudaDevAttrMaxBlockDimX"] = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME}; // 2 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X = 2) cuda2hipRename["cudaDevAttrMaxBlockDimY"] = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME}; // 3 // API_DRIVER ANALOGUE (CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y = 3) From b237b831014f67424ea82be6311506b4dbcce89c Mon Sep 17 00:00:00 2001 From: Sandeep Kumar Date: Mon, 12 Jun 2017 17:14:12 +0530 Subject: [PATCH 148/171] Add peer2peer bandwidth and latency test Change-Id: I6d88e4aa9f6e64096af16579eebef4740734203e [ROCm/clr commit: 01842faa1cc15d6ae1f4bd88a91b2b98b6d6a750] --- .../hipBusBandwidth/hipBusBandwidth.cpp | 395 +++++++++++++++++- 1 file changed, 372 insertions(+), 23 deletions(-) diff --git 
a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp index 09f78543c9..b3b0b3e4a6 100644 --- a/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp +++ b/projects/clr/hipamd/samples/1_Utils/hipBusBandwidth/hipBusBandwidth.cpp @@ -16,13 +16,15 @@ int p_iterations = 10; int p_beatsperiteration=1; int p_device = 0; int p_detailed = 0; -bool p_async = 0; +bool p_async = 0; int p_alignedhost = 0; // align host allocs to this granularity, in bytes. 64 or 4096 are good values to try. -int p_onesize = 0; +int p_onesize = 0; bool p_h2d = true; bool p_d2h = true; bool p_bidir = true; +bool p_p2p = false; + //#define NO_CHECK @@ -70,7 +72,7 @@ std::string sizeToString(int size) // **************************************************************************** -hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind) +hipError_t memcopy(void * dst, const void *src, size_t sizeBytes, enum hipMemcpyKind kind ) { if (p_async) { return hipMemcpyAsync(dst, src, sizeBytes, kind, NULL); @@ -632,6 +634,9 @@ void RunBenchmark_Bidir(ResultDatabase &resultDB) } + + + #define failed(...) \ printf ("error: ");\ printf (__VA_ARGS__);\ @@ -646,6 +651,326 @@ int parseInt(const char *str, int *output) } +void checkPeer2PeerSupport() +{ + int deviceCnt; + hipGetDeviceCount(&deviceCnt); + std::cout << "Total no. 
of available gpu #" << deviceCnt << "\n" << std::endl; + + for(int deviceId=0; deviceIdhost then host-->GPU2)\n\n" << std::endl; +} + +void enablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceEnablePeerAccess(peerGpu, 0); + } +} + +void disablePeer2Peer(int currentGpu, int peerGpu) +{ + int canAccessPeer; + + hipSetDevice(currentGpu); + hipDeviceCanAccessPeer(&canAccessPeer, currentGpu, peerGpu); + + if(canAccessPeer==1){ + hipDeviceDisablePeerAccess(peerGpu); + } +} + +std::string gpuIDToString(int gpuID) +{ + using namespace std; + stringstream ss; + ss << gpuID; + return ss.str(); +} + +void RunBenchmark_P2P_Unidir(ResultDatabase &resultDB) +{ + int gpuCount; + hipGetDeviceCount(&gpuCount); + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_uni") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + // Cleanup + hipFree((void*)currentGpuMem); + hipFree((void*)peerGpuMem); + CHECK_HIP_ERROR(); + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + + } + +} + +void RunBenchmark_P2P_Bidir(ResultDatabase &resultDB) { + + int gpuCount; + 
hipGetDeviceCount(&gpuCount); + + hipStream_t stream[2]; + + int currentGpu, peerGpu; + + long long numMaxFloats = 1024 * (sizes[nSizes-1]) / 4; + + for (currentGpu=0; currentGpu1) { + sprintf(sizeStr, "%9sx%d", sizeToString(thisSize).c_str(), p_beatsperiteration); + } else { + sprintf(sizeStr, "%9s", sizeToString(thisSize).c_str()); + } + + string cGpu, pGpu; + cGpu = gpuIDToString(currentGpu); + pGpu = gpuIDToString(peerGpu); + + resultDB.AddResult(std::string("p2p_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "GB/sec", speed); + resultDB.AddResult(std::string("P2P_bi") + "_gpu" + std::string(cGpu)+ "_gpu" + std::string(pGpu), sizeStr, "ms", t); + + if (p_onesize) { + break; + } + } + + } + + if (p_onesize) { + numMaxFloats = sizeToBytes(p_onesize) / sizeof(float); + } + + disablePeer2Peer(currentGpu, peerGpu); + + hipEventDestroy(start); + hipEventDestroy(stop); + + for (int i=0; i<2; i++) { + hipStreamDestroy(stream[i]); + + hipFree((void*)currentGpuMem[i]); + hipFree((void*)peerGpuMem[i]); + CHECK_HIP_ERROR(); + } + + hipSetDevice(peerGpu); + hipDeviceReset(); + + hipSetDevice(currentGpu); + hipDeviceReset(); + } + } +} + + void printConfig() { hipDeviceProp_t props; hipGetDeviceProperties(&props, p_device); @@ -662,9 +987,9 @@ void help() { printf (" --d2h : Run only device-to-host test.\n"); printf (" --h2d : Run only host-to-device test.\n"); printf (" --bidir : Run only bidir copy test.\n"); + printf (" --p2p : Run only peer2peer unidir and bidir copy tests.\n"); printf (" --verbose : Print verbose status messages as test is run.\n"); printf (" --detailed : Print detailed report (including all trials).\n"); - printf (" --async : Use hipMemcpyAsync(with NULL stream) for H2D/D2H. 
Default uses hipMemcpy.\n"); printf (" --onesize, -o : Only run one measurement, at specified size (in KB, or if negative in bytes)\n"); @@ -712,6 +1037,12 @@ int parseStandardArguments(int argc, char *argv[]) p_d2h = false; p_bidir = true; + } else if (!strcmp(arg, "--p2p")) { + p_h2d = false; + p_d2h = false; + p_bidir = false; + p_p2p = true; + } else if (!strcmp(arg, "--help") || (!strcmp(arg, "-h"))) { help(); exit(EXIT_SUCCESS); @@ -737,39 +1068,57 @@ int main(int argc, char *argv[]) { parseStandardArguments(argc, argv); - printConfig(); + if (p_p2p) { + checkPeer2PeerSupport(); - if (p_h2d) { - ResultDatabase resultDB; - RunBenchmark_H2D(resultDB); + ResultDatabase resultDB_Unidir, resultDB_Bidir; - resultDB.DumpSummary(std::cout); + RunBenchmark_P2P_Unidir(resultDB_Unidir); + RunBenchmark_P2P_Bidir(resultDB_Bidir); + + resultDB_Unidir.DumpSummary(std::cout); + resultDB_Bidir.DumpSummary(std::cout); if (p_detailed) { - resultDB.DumpDetailed(std::cout); + resultDB_Unidir.DumpDetailed(std::cout); + resultDB_Bidir.DumpDetailed(std::cout); } } + else { + printConfig(); - if (p_d2h) { - ResultDatabase resultDB; - RunBenchmark_D2H(resultDB); + if (p_h2d) { + ResultDatabase resultDB; + RunBenchmark_H2D(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } + } + + if (p_d2h) { + ResultDatabase resultDB; + RunBenchmark_D2H(resultDB); + + resultDB.DumpSummary(std::cout); + + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } - } - if (p_bidir) { - ResultDatabase resultDB; - RunBenchmark_Bidir(resultDB); + if (p_bidir) { + ResultDatabase resultDB; + RunBenchmark_Bidir(resultDB); - resultDB.DumpSummary(std::cout); + resultDB.DumpSummary(std::cout); - if (p_detailed) { - resultDB.DumpDetailed(std::cout); + if (p_detailed) { + resultDB.DumpDetailed(std::cout); + } } } } From 86f2c1e395c0ed35e978c6b531edfd2904e17a5f Mon Sep 
17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 14 Jun 2017 09:45:46 +0530 Subject: [PATCH 149/171] Bump HIP base version to 1.2 Change-Id: I8ecc164afed4383f78579ed86a5c8c11a73b0780 [ROCm/clr commit: 4fa2090b78dde2de636ffb89ab7cebdc58849f9a] --- projects/clr/hipamd/bin/hipconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/bin/hipconfig b/projects/clr/hipamd/bin/hipconfig index 663a1e14cd..39fdab5a99 100755 --- a/projects/clr/hipamd/bin/hipconfig +++ b/projects/clr/hipamd/bin/hipconfig @@ -1,7 +1,7 @@ #!/usr/bin/perl -w $HIP_BASE_VERSION_MAJOR = "1"; -$HIP_BASE_VERSION_MINOR = "0"; +$HIP_BASE_VERSION_MINOR = "2"; # Need perl > 5.10 to use logic-defined or use 5.006; use v5.10.1; From 389f1d6aa87af8caaed16d3d0def9bd5c58bafcf Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Wed, 14 Jun 2017 15:18:57 +0530 Subject: [PATCH 150/171] Validity check of input arguments in Ipc Mem APIs Change-Id: Ia48e949d19f354f10c7e44cc2457fd4154bf6d76 [ROCm/clr commit: 85708089d1ec0d0b7f78ac3a0a8ae0e3eebd3bfb] --- projects/clr/hipamd/src/hip_memory.cpp | 88 ++++++++++++++------------ 1 file changed, 48 insertions(+), 40 deletions(-) diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index c04c2611c3..ce65579e34 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -1275,70 +1275,78 @@ hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr){ // Get the size of allocated pointer size_t psize; hc::accelerator acc; - hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 ); - am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr ); - if (status == AM_SUCCESS) { - psize = (size_t)amPointerInfo._sizeBytes; - } - else + if((handle == NULL) || (devPtr == NULL)) { hipStatus = hipErrorInvalidResourceHandle; - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle; - // Save the size of the pointer to hipIpcMemHandle - 
iHandle->psize = psize; + } else { + hc::AmPointerInfo amPointerInfo( NULL , NULL , 0 , acc , 0 , 0 ); + am_status_t status = hc::am_memtracker_getinfo( &amPointerInfo , devPtr ); + if (status == AM_SUCCESS) { + psize = (size_t)amPointerInfo._sizeBytes; + } else + hipStatus = hipErrorInvalidResourceHandle; + ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) handle; + // Save the size of the pointer to hipIpcMemHandle + iHandle->psize = psize; #if USE_IPC - // Create HSA ipc memory - hsa_status_t hsa_status = - hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle)); - if(hsa_status!= HSA_STATUS_SUCCESS) - hipStatus = hipErrorMemoryAllocation; + // Create HSA ipc memory + hsa_status_t hsa_status = + hsa_amd_ipc_memory_create(devPtr, psize, (hsa_amd_ipc_memory_t*) &(iHandle->ipc_handle)); + if(hsa_status!= HSA_STATUS_SUCCESS) + hipStatus = hipErrorMemoryAllocation; #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif - + } return ihipLogStatus(hipStatus); } hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned int flags){ HIP_INIT_API ( devPtr, &handle , flags); hipError_t hipStatus = hipSuccess; - + if(devPtr == NULL) { + hipStatus = hipErrorInvalidValue; + } else { #if USE_IPC - // Get the current device agent. - hc::accelerator acc; - hsa_agent_t *agent = static_cast(acc.get_hsa_agent()); - if(!agent) - return hipErrorInvalidResourceHandle; + // Get the current device agent. 
+ hc::accelerator acc; + hsa_agent_t *agent = static_cast(acc.get_hsa_agent()); + if(!agent) + return hipErrorInvalidResourceHandle; - ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; - //Attach ipc memory - auto ctx= ihipGetTlsDefaultCtx(); - { - LockedAccessor_CtxCrit_t crit(ctx->criticalData()); - // the peerCnt always stores self so make sure the trace actually - hsa_status_t hsa_status = - hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - hipStatus = hipErrorMapBufferObjectFailed; - } + ihipIpcMemHandle_t* iHandle = (ihipIpcMemHandle_t*) &handle; + //Attach ipc memory + auto ctx= ihipGetTlsDefaultCtx(); + { + LockedAccessor_CtxCrit_t crit(ctx->criticalData()); + // the peerCnt always stores self so make sure the trace actually + hsa_status_t hsa_status = + hsa_amd_ipc_memory_attach((hsa_amd_ipc_memory_t*)&(iHandle->ipc_handle), iHandle->psize, crit->peerCnt(), crit->peerAgents(), devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + hipStatus = hipErrorMapBufferObjectFailed; + } #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif + } return ihipLogStatus(hipStatus); } hipError_t hipIpcCloseMemHandle(void *devPtr){ HIP_INIT_API ( devPtr ); hipError_t hipStatus = hipSuccess; - + if(devPtr == NULL) { + hipStatus = hipErrorInvalidValue; + } else { #if USE_IPC - hsa_status_t hsa_status = - hsa_amd_ipc_memory_detach(devPtr); - if(hsa_status != HSA_STATUS_SUCCESS) - return hipErrorInvalidResourceHandle; + hsa_status_t hsa_status = + hsa_amd_ipc_memory_detach(devPtr); + if(hsa_status != HSA_STATUS_SUCCESS) + return hipErrorInvalidResourceHandle; #else - hipStatus = hipErrorRuntimeOther; + hipStatus = hipErrorRuntimeOther; #endif + } return ihipLogStatus(hipStatus); } From a1a23c69d3ca5fa44009453b33aa0eb166fd1313 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Wed, 14 Jun 2017 11:10:52 -0500 Subject: 
[PATCH 151/171] Additional GGL make_kernel_functor_* macros, contributed by Alex Change-Id: I01aabb7d2b5418fcefb1bbf78eb5d1888dbc5c96 [ROCm/clr commit: 9bce2af76f2b7f3a1322071752bc9b804029174f] --- .../hip/hcc_detail/grid_launch_GGL.hpp | 122 ++++++++++++++++++ 1 file changed, 122 insertions(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp index 8e3dab8482..eac48b595e 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/grid_launch_GGL.hpp @@ -245,6 +245,128 @@ namespace hip_impl HIP_kernel_functor_name_begin ## _ ## k ## _ ## \ HIP_kernel_functor_name_end ## _ ## n + #define make_kernel_functor_hip_30(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26, p27)\ + struct make_kernel_name_hip(function_name, 28) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + std::decay_t _p27_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_, _p27_);\ + }\ + } + #define make_kernel_functor_hip_29(\ + function_name, kernel_name, p0, p1, 
p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25, p26)\ + struct make_kernel_name_hip(function_name, 27) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + std::decay_t _p26_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, _p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_,\ + _p26_);\ + }\ + } + #define make_kernel_functor_hip_28(\ + function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ + p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ + p24, p25)\ + struct make_kernel_name_hip(function_name, 26) {\ + std::decay_t _p0_;\ + std::decay_t _p1_;\ + std::decay_t _p2_;\ + std::decay_t _p3_;\ + std::decay_t _p4_;\ + std::decay_t _p5_;\ + std::decay_t _p6_;\ + std::decay_t _p7_;\ + std::decay_t _p8_;\ + std::decay_t _p9_;\ + std::decay_t _p10_;\ + std::decay_t _p11_;\ + std::decay_t _p12_;\ + std::decay_t _p13_;\ + std::decay_t _p14_;\ + std::decay_t _p15_;\ + std::decay_t _p16_;\ + std::decay_t _p17_;\ + std::decay_t _p18_;\ + std::decay_t _p19_;\ + std::decay_t _p20_;\ + std::decay_t _p21_;\ + std::decay_t _p22_;\ + std::decay_t _p23_;\ + std::decay_t _p24_;\ + std::decay_t _p25_;\ + void operator()(const hc::tiled_index<3>&) const [[hc]]\ + {\ + kernel_name(\ + _p0_, _p1_, 
_p2_, _p3_, _p4_, _p5_, _p6_, _p7_, _p8_, _p9_,\ + _p10_, _p11_, _p12_, _p13_, _p14_, _p15_, _p16_, _p17_,\ + _p18_, _p19_, _p20_, _p21_, _p22_, _p23_, _p24_, _p25_);\ + }\ + } #define make_kernel_functor_hip_27(\ function_name, kernel_name, p0, p1, p2, p3, p4, p5, p6, p7, p8, p9,\ p10, p11, p12, p13, p14, p15, p16, p17, p18, p19, p20, p21, p22, p23,\ From 1912c9f3088537af3d2a681afc03be0f328fe09f Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 14 Jun 2017 19:55:55 +0300 Subject: [PATCH 152/171] [HIPIFY] Sync HIPIFY with HIP by CUDA Driver API functions. + 4.12. Unified Addressing + 4.13. Stream Management ToDo: 4.14 - 4.31 modules of CUDA Driver API. [ROCm/clr commit: 094f1b1f2f50657f290ec3c8c6461efcf6f3e3f3] --- ...A_Driver_API_functions_supported_by_HIP.md | 22 +++++++- .../clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 54 ++++++++++--------- 2 files changed, 50 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index d4b54438bb..0b3bb540bf 100644 --- a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -553,13 +553,31 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuMemAdvise` | | Advise about the usage of a given memory range. | +| `cuMemPrefetchAsync` | | Prefetches memory to the specified destination device. | +| `cuMemRangeGetAttribute` | | Query an attribute of a given memory range. | +| `cuMemRangeGetAttributes` | | Query attributes of a given memory range. | +| `cuPointerGetAttribute` | | Returns information about a pointer. 
| +| `cuPointerGetAttributes` | | Returns information about a pointer. | +| `cuPointerSetAttribute` | | Set attributes on a previously allocated memory region. | ## **13. Stream Management** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuStreamAddCallback` | | Add a callback to a compute stream. | +| `cuStreamAttachMemAsync` | | Attach memory to a stream asynchronously. | +| `cuStreamCreate` | | Create a stream. | +| `cuStreamCreateWithPriority` | | Create a stream with the given priority. | +| `cuStreamDestroy` | `hipStreamDestroy` | Destroys a stream. | +| `cuStreamGetFlags` | `hipStreamGetFlags` | Query the flags of a given stream. | +| `cuStreamGetPriority` | `hipStreamGetPriority` | Query the priority of a given stream. | +| `cuStreamQuery` | `hipStreamQuery` | Determine status of a compute stream. | +| `cuStreamSynchronize` | `hipStreamSynchronize` | Wait until a stream's tasks are completed. | +| `cuStreamWaitEvent` | `hipStreamWaitEvent` | Make a compute stream wait on an event. | +| `cuStreamBatchMemOp` | | Batch operations to synchronize the stream via memory operations. | +| `cuStreamWaitValue32` | | Wait on a memory location. | +| `cuStreamWriteValue32` | | Write a value to memory. | ## **14. 
Event Management** diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index de4da78451..7f9fefa7f9 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -990,17 +990,19 @@ struct cuda2hipMap { cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams - // unsupported yet by HIP cuda2hipRename["cuStreamAddCallback"] = {"hipStreamAddCallback", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE - - cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamAttachMemAsync"] = {"hipStreamAttachMemAsync", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuStreamCreate"] = {"hipStreamCreate__", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaStreamCreate due to different signatures + cuda2hipRename["cuStreamCreateWithPriority"] = {"hipStreamCreateWithPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuStreamDestroy_v2"] = {"hipStreamDestroy", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamGetFlags"] = {"hipStreamGetFlags", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamGetPriority"] = {"hipStreamGetPriority", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuStreamQuery"] = {"hipStreamQuery", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamSynchronize"] = {"hipStreamSynchronize", CONV_STREAM, API_DRIVER}; cuda2hipRename["cuStreamWaitEvent"] = 
{"hipStreamWaitEvent", CONV_STREAM, API_DRIVER}; + cuda2hipRename["cuStreamWaitValue32"] = {"hipStreamWaitValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamWriteValue32"] = {"hipStreamWriteValue32", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE + cuda2hipRename["cuStreamBatchMemOp"] = {"hipStreamBatchMemOp", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE // Memory management cuda2hipRename["cuArray3DCreate"] = {"hipArray3DCreate", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1016,16 +1018,16 @@ struct cuda2hipMap { cuda2hipRename["cuMemAlloc_v2"] = {"hipMalloc", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemAllocHost"] = {"hipMemAllocHost", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemAllocManaged"] = {"hipMemAllocManaged", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures - cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures - cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2D due to different signatures - cuda2hipRename["cuMemcpy2DAsync"] = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures + cuda2hipRename["cuMemAllocPitch"] = {"hipMemAllocPitch__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemAllocPitch due to different signatures + cuda2hipRename["cuMemcpy"] = {"hipMemcpy__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy due to different signatures + cuda2hipRename["cuMemcpy2D"] = {"hipMemcpy2D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2D due to different 
signatures + cuda2hipRename["cuMemcpy2DAsync"] = {"hipMemcpy2DAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy2DAsync due to different signatures cuda2hipRename["cuMemcpy2DUnaligned"] = {"hipMemcpy2DUnaligned", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures - cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures - cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures - cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures - cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures + cuda2hipRename["cuMemcpy3D"] = {"hipMemcpy3D__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3D due to different signatures + cuda2hipRename["cuMemcpy3DAsync"] = {"hipMemcpy3DAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DAsync due to different signatures + cuda2hipRename["cuMemcpy3DPeer"] = {"hipMemcpy3DPeer__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeer due to different signatures + cuda2hipRename["cuMemcpy3DPeerAsync"] = {"hipMemcpy3DPeerAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpy3DPeerAsync due to different signatures + cuda2hipRename["cuMemcpyAsync"] = {"hipMemcpyAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyAsync due to different signatures cuda2hipRename["cuMemcpyAtoA"] = {"hipMemcpyAtoA", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; 
cuda2hipRename["cuMemcpyAtoD"] = {"hipMemcpyAtoD", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyAtoH"] = {"hipMemcpyAtoH", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1039,17 +1041,17 @@ struct cuda2hipMap { cuda2hipRename["cuMemcpyHtoAAsync"] = {"hipMemcpyHtoAAsync", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemcpyHtoD_v2"] = {"hipMemcpyHtoD", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemcpyHtoDAsync_v2"] = {"hipMemcpyHtoDAsync", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures - cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures + cuda2hipRename["cuMemcpyPeerAsync"] = {"hipMemcpyPeerAsync__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeerAsync due to different signatures + cuda2hipRename["cuMemcpyPeer"] = {"hipMemcpyPeer__", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; // Not equal to cudaMemcpyPeer due to different signatures cuda2hipRename["cuMemFree_v2"] = {"hipFree", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemFreeHost"] = {"hipHostFree", CONV_MEM, API_DRIVER}; cuda2hipRename["cuMemGetAddressRange"] = {"hipMemGetAddressRange", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemGetInfo_v2"] = {"hipMemGetInfo", CONV_MEM, API_DRIVER}; - cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostAlloc"] = {"hipHostMalloc", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) cuda2hipRename["cuMemHostGetDevicePointer"] = {"hipMemHostGetDevicePointer", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemHostGetFlags"] = {"hipMemHostGetFlags", CONV_MEM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, 
API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) - cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister) + cuda2hipRename["cuMemHostRegister_v2"] = {"hipHostRegister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostAlloc) + cuda2hipRename["cuMemHostUnregister"] = {"hipHostUnregister", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaHostUnregister) cuda2hipRename["cuMemsetD16_v2"] = {"hipMemsetD16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD16Async"] = {"hipMemsetD16Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D16_v2"] = {"hipMemsetD2D16", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; @@ -1058,18 +1060,22 @@ struct cuda2hipMap { cuda2hipRename["cuMemsetD2D32Async"] = {"hipMemsetD2D32Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D8_v2"] = {"hipMemsetD2D8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD2D8Async"] = {"hipMemsetD2D8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) - cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) + cuda2hipRename["cuMemsetD32_v2"] = {"hipMemset", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemset) + cuda2hipRename["cuMemsetD32Async"] = {"hipMemsetAsync", CONV_MEM, API_DRIVER}; // API_Runtime ANALOGUE (cudaMemsetAsync) cuda2hipRename["cuMemsetD8_v2"] = {"hipMemsetD8", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMemsetD8Async"] = {"hipMemsetD8Async", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMipmappedArrayCreate"] = {"hipMipmappedArrayCreate", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuMipmappedArrayDestroy"] = {"hipMipmappedArrayDestroy", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; 
cuda2hipRename["cuMipmappedArrayGetLevel"] = {"hipMipmappedArrayGetLevel", CONV_STREAM, API_DRIVER, HIP_UNSUPPORTED}; - // unsupported yet by HIP [CUDA 8.0.44] - cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync___", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) + + // Unified Addressing + cuda2hipRename["cuMemPrefetchAsync"] = {"hipMemPrefetchAsync__", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // no API_Runtime ANALOGUE (cudaMemPrefetchAsync has different signature) cuda2hipRename["cuMemAdvise"] = {"hipMemAdvise", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemAdvise) cuda2hipRename["cuMemRangeGetAttribute"] = {"hipMemRangeGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttribute) cuda2hipRename["cuMemRangeGetAttributes"] = {"hipMemRangeGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // [CUDA 8.0.44] // API_Runtime ANALOGUE (cudaMemRangeGetAttributes) + cuda2hipRename["cuPointerGetAttribute"] = {"hipPointerGetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cuPointerGetAttributes"] = {"hipPointerGetAttributes", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; + cuda2hipRename["cuPointerSetAttribute"] = {"hipPointerSetAttribute", CONV_MEM, API_RUNTIME, HIP_UNSUPPORTED}; // Texture Reference Mngmnt // Texture reference filtering modes From 52af9a3cf36cbac60851f83876ca816e3380c6a6 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 15 Jun 2017 00:21:47 +0530 Subject: [PATCH 153/171] Arguments validation in hipDeviceGetPCIBusId Change-Id: I89770517c3ac94e4bf476344d27c18f03cfcde08 [ROCm/clr commit: 3f25611692e2ea54c12cfc276fe31e861160b901] --- projects/clr/hipamd/src/hip_device.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/src/hip_device.cpp 
b/projects/clr/hipamd/src/hip_device.cpp index 05db4c2b30..2bb9970d35 100644 --- a/projects/clr/hipamd/src/hip_device.cpp +++ b/projects/clr/hipamd/src/hip_device.cpp @@ -369,12 +369,24 @@ hipError_t hipDeviceGetName(char *name,int len,hipDevice_t device) hipError_t hipDeviceGetPCIBusId (char *pciBusId,int len, int device) { HIP_INIT_API(pciBusId, len, device); - hipError_t e = hipSuccess; - int tempPciBusId = 0; - e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device); - if( e == hipSuccess) { - std::string tempPciStr = std::to_string(tempPciBusId); - memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() ); + hipError_t e = hipErrorInvalidValue; + int deviceCount = 0; + ihipGetDeviceCount( &deviceCount ); + if((device > deviceCount) || (device < 0)) { + e = hipErrorInvalidDevice; + } else { + if((pciBusId != nullptr) && (len > 0)) { + int tempPciBusId = 0; + e = ihipDeviceGetAttribute( &tempPciBusId, hipDeviceAttributePciBusId, device); + if( e == hipSuccess) { + std::string tempPciStr = std::to_string(tempPciBusId); + if( len < tempPciStr.length()){ + e = hipErrorInvalidValue; + } else { + memcpy( pciBusId , tempPciStr.c_str() , tempPciStr.length() ); + } + } + } } return ihipLogStatus(e); } From cd12c963b061bf88f05a4b87528436f02607fc42 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 16 Jun 2017 09:02:26 -0500 Subject: [PATCH 154/171] removed bad copy constructor Change-Id: I661991d9d43941a61848b0b8e9879c0bfa811b40 [ROCm/clr commit: 34e14bb02df84572e33aa6319f918a28f5d0b4fe] --- projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index 9da34d9f32..93c82cc0cb 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -36,25 +36,21 @@ THE SOFTWARE. 
#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x) { } \ __device__ __host__ type(const type& val) : x(val.x) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { } \ __device__ __host__ ~type() {} #define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \ __device__ __host__ type() {} \ -__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ __device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \ __device__ __host__ ~type() {} From 16341302671226916e4ba7ca068cff4adda4ade8 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 16 Jun 2017 09:07:06 -0500 Subject: [PATCH 155/171] fixed float2int functions Change-Id: I67be79149f06daacf0f0d131bdedabf294126248 [ROCm/clr commit: f139d5a52f54367a90a98ae32ee555ca85bb7e75] --- projects/clr/hipamd/src/device_functions.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/projects/clr/hipamd/src/device_functions.cpp b/projects/clr/hipamd/src/device_functions.cpp index 10d8d3ab89..615ae4d0b7 100644 --- a/projects/clr/hipamd/src/device_functions.cpp +++ b/projects/clr/hipamd/src/device_functions.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights @@ -149,19 +149,19 @@ __device__ long long int __double_as_longlong(double x) return hold64.sli; } -__device__ int float2int_rd(float x) +__device__ int __float2int_rd(float x) { return (int)x; } -__device__ int float2int_rn(float x) +__device__ int __float2int_rn(float x) { return (int)x; } -__device__ int float2int_ru(float x) +__device__ int __float2int_ru(float x) { return (int)x; } -__device__ int float2int_rz(float x) +__device__ int __float2int_rz(float x) { return (int)x; } From 20dd968547abce8f824d98984045541e99bcb03a Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Sun, 18 Jun 2017 12:31:31 +0530 Subject: [PATCH 156/171] Abort device function in HIP/HCC, need new HCC Change-Id: I4195ab75e9b7b48c8b8128d6925ddc0fa5e9e009 [ROCm/clr commit: d9935cd089d610e23593f3787057d6d485c0d67d] --- projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h | 3 +++ projects/clr/hipamd/src/device_util.cpp | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h index 95826f9b60..da3b7ba50e 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -174,6 +174,9 @@ static constexpr int warpSize = 64; __device__ long long int clock64(); __device__ clock_t clock(); +//abort +__device__ void abort(); + //atomicAdd() __device__ int atomicAdd(int* address, int val); __device__ unsigned int atomicAdd(unsigned int* address, diff --git a/projects/clr/hipamd/src/device_util.cpp b/projects/clr/hipamd/src/device_util.cpp index 062372f0f4..1efda02933 100644 --- a/projects/clr/hipamd/src/device_util.cpp +++ b/projects/clr/hipamd/src/device_util.cpp @@ -839,6 +839,11 @@ 
__device__ float __hip_ynf(int n, float x) __device__ long long int clock64() { return (long long int)hc::__cycle_u64(); }; __device__ clock_t clock() { return (clock_t)hc::__cycle_u64(); }; +//abort +__device__ void abort() +{ + return hc::abort(); +} //atomicAdd() __device__ int atomicAdd(int* address, int val) From 3e29f81050c81d8bfec6f615082785bde0b74702 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Tue, 20 Jun 2017 09:38:56 +0530 Subject: [PATCH 157/171] Added device side abort function in HIP/NVCC Change-Id: I6ae35a72a8b9c34852619f02da1a046c8d3b2ed3 [ROCm/clr commit: 0fd6b59e22f9991f3c5767b03e1ff197d8ea40cd] --- projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h index 80da388007..8c08f3d151 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime.h @@ -109,6 +109,10 @@ kernelName<<>>(__VA_ARGS__);\ #define HIP_DYNAMIC_SHARED_ATTRIBUTE +#ifdef __HIP_DEVICE_COMPILE__ +#define abort() {asm("trap;");} +#endif + #endif #endif From 512482baddfa3262c9d3df3223ebbd16cac7655d Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Tue, 20 Jun 2017 11:35:52 -0500 Subject: [PATCH 158/171] removed rm for /opt/rocm/hip/src in inline asm sample Change-Id: I0c02bccd4cd35e01a8e889ea1e586ea8baf0ab90 [ROCm/clr commit: fba69b1ce7323785b429c01a4791def68c1edc21] --- projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile index 77a7699635..6ad3c201bd 100644 --- a/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile +++ b/projects/clr/hipamd/samples/2_Cookbook/10_inline_asm/Makefile @@ -32,4 +32,4 @@ test: $(EXECUTABLE) clean: rm -f 
$(EXECUTABLE) rm -f $(OBJECTS) - rm -f $(HIP_PATH)/src/*.o + From 8c20ecdfde9d03bd53a19d1044ee07ee73ce6964 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 22 Jun 2017 21:53:32 +0300 Subject: [PATCH 159/171] [HIPIFY] Sync more CUDA Driver API functions. + 4.14. Event Management + 4.15. Execution Control ToDo: 4.16 - 4.31 modules of CUDA Driver API. [ROCm/clr commit: cbb5c63dd674eddba5c4cd56df7ccca5c9797cd9] --- .../CUDA_Driver_API_functions_supported_by_HIP.md | 11 ++++++++++- projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp | 4 ++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md index 0b3bb540bf..d797b31832 100644 --- a/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md +++ b/projects/clr/hipamd/docs/markdown/CUDA_Driver_API_functions_supported_by_HIP.md @@ -583,12 +583,21 @@ | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| - +| `cuEventCreate` | `hipEventCreate` | Creates an event. | +| `cuEventDestroy` | `hipEventDestroy` | Destroys an event. | +| `cuEventElapsedTime` | `hipEventElapsedTime` | Computes the elapsed time between two events. | +| `cuEventQuery` | `hipEventQuery` | Queries an event's status. | +| `cuEventRecord` | `hipEventRecord` | Records an event. | +| `cuEventSynchronize` | `hipEventSynchronize` | Waits for an event to complete. | ## **15. 
Execution Control** | **CUDA** | **HIP** | **CUDA description** | |-----------------------------------------------------------|-------------------------------|--------------------------------------------------------------------------------------------------------------------------------| +| `cuFuncGetAttribute` | | Returns information about a function. | +| `cuFuncSetCacheConfig` | `hipFuncSetCacheConfig` | Sets the preferred cache configuration for a device function. | +| `cuFuncSetSharedMemConfig` | | Sets the shared memory configuration for a device function. | +| `cuLaunchKernel` | `hipModuleLaunchKernel` | Launches a CUDA function. | ## **16. Execution Control [DEPRECATED]** diff --git a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp index 7f9fefa7f9..9b58173899 100644 --- a/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ b/projects/clr/hipamd/hipify-clang/src/Cuda2Hip.cpp @@ -987,6 +987,10 @@ struct cuda2hipMap { cuda2hipRename["cuEventRecord"] = {"hipEventRecord", CONV_EVENT, API_DRIVER}; cuda2hipRename["cuEventSynchronize"] = {"hipEventSynchronize", CONV_EVENT, API_DRIVER}; + // Execution Control + cuda2hipRename["cuFuncGetAttribute"] = {"hipFuncGetAttribute", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; + cuda2hipRename["cuFuncSetCacheConfig"] = {"hipFuncSetCacheConfig", CONV_MODULE, API_DRIVER}; + cuda2hipRename["cuFuncSetSharedMemConfig"] = {"hipFuncSetSharedMemConfig", CONV_MODULE, API_DRIVER, HIP_UNSUPPORTED}; cuda2hipRename["cuLaunchKernel"] = {"hipModuleLaunchKernel", CONV_MODULE, API_DRIVER}; // Streams From 92e8c134904a1abbe546b7d6d74b6b73ceb5070d Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 23 Jun 2017 21:59:24 +0300 Subject: [PATCH 160/171] [HIPIFY] [DOC] Fix typo. 
[ROCm/clr commit: ec2c1316a8bc3180dc64a4e235f0b0f3e0b623e8] --- projects/clr/hipamd/hipify-clang/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index c0d74dbe48..d74c53f187 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -70,7 +70,7 @@ To set additional options like Language Selection (only "-x cuda" is supported), Delimiter "--" is used to separate hipify-clang options (before the delimiter) from clang options (after the delimiter). It is strongly recommended to always specify the delimiter, even if there are no clang specific options at all, in order to avoid possible errors regarding compilation database; in such case delimeter should be the last option in hipify-clang's command line. -Option "-x clang" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). +Option "-x cuda" is also worth specifying in order to convert source CUDA files with extensions other than standard extensions (*.cu, *.cuh). ## Disclaimer From f1bed37a347570195fb0846ecee7a290d3f92b45 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 10:38:29 -0500 Subject: [PATCH 161/171] Clean up old USE_* and RELEASE.md notes. 
[ROCm/clr commit: 42882ddf9c9ba0b8a9f098314f8e3a69be8eadf0] --- projects/clr/hipamd/RELEASE.md | 6 ------ .../clr/hipamd/include/hip/hcc_detail/host_defines.h | 5 ----- projects/clr/hipamd/src/hip_hcc.cpp | 3 --- projects/clr/hipamd/tests/src/hipPointerAttrib.cpp | 9 --------- .../tests/src/runtimeApi/memory/p2p_copy_coherency.cpp | 4 ---- 5 files changed, 27 deletions(-) diff --git a/projects/clr/hipamd/RELEASE.md b/projects/clr/hipamd/RELEASE.md index 5787c59881..d6f3ec594c 100644 --- a/projects/clr/hipamd/RELEASE.md +++ b/projects/clr/hipamd/RELEASE.md @@ -2,12 +2,6 @@ We have attempted to document known bugs and limitations - in particular the [HIP Kernel Language](docs/markdown/hip_kernel_language.md) document uses the phrase "Under Development", and the [HIP Runtime API bug list](http://gpuopen-professionalcompute-tools.github.io/HIP/bug.html) lists known bugs. -Upcoming: -- Stability: Enforce periodic host synchronization to reclaim resources if the application has launched a large - number of commands (>1K) without synchronizing. -- Register keyword now silently ignored on HCC (previously would emit warning). -- Doc updates: Add some more frequently asked questions to FAQ, fix TOC in some files, review. -- Cookbook. =================================================================================================== diff --git a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h index 140cbb0678..212fd650a3 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h @@ -28,7 +28,6 @@ THE SOFTWARE. #ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H #define HIP_INCLUDE_HIP_HCC_DETAIL_HOST_DEFINES_H -#define USE_PROMOTE_FREE_HCC 1 // Add guard to Generic Grid Launch method #ifndef GENERIC_GRID_LAUNCH @@ -61,11 +60,7 @@ THE SOFTWARE. 
*/ // _restrict is supported by the compiler #define __shared__ tile_static -#if USE_PROMOTE_FREE_HCC==1 #define __constant__ __attribute__((hc)) -#else -#define __constant__ ADDRESS_SPACE_1 -#endif #else // Non-HCC compiler diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index d826a0cec3..061714070e 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -48,9 +48,6 @@ THE SOFTWARE. #include "env.h" -// needs HCC change for hc::no_scope -#define USE_NO_SCOPE 1 - //================================================================================================= //Global variables: //================================================================================================= diff --git a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp index 7a2ab64bea..bddbff5ce0 100644 --- a/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp +++ b/projects/clr/hipamd/tests/src/hipPointerAttrib.cpp @@ -32,7 +32,6 @@ THE SOFTWARE. #endif -#define USE_AV_COPY (__hcc_workweek__ >= 16351) size_t Nbytes = 0; @@ -410,21 +409,13 @@ void thread_noise_generator(int iters, size_t numBuffers, Dir addDir, Dir remove if (addDir == Up) { for (char *p = basePtr; p=0; p-=bufferSize) { -#if USE_AV_COPY hc::AmPointerInfo info(p, p, bufferSize, acc, false, false); hc::am_memtracker_add(p, info); -#else - hc::am_memtracker_add(p, bufferSize, acc, false); -#endif } } diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index a5d79464d0..9fadebea1e 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -36,7 +36,6 @@ THE SOFTWARE. 
#define USE_HCC_MEMTRACKER 0 #endif -#define USE_HSA_COPY 1 int elementSizes[] = {16, 1024,524288}; int nSizes = sizeof(elementSizes) / sizeof(int); @@ -102,11 +101,8 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ hipStream_t stepAStream = gpu0Stream; if (stepAIsCopy) { -#ifdef USE_HSA_COPY HIPCHECK(hipMemcpyAsync(dataGpu1, dataGpu0_0, sizeElements, hipMemcpyDeviceToDevice, stepAStream)); -#endif } else { - //assert(0); // not yet supported. unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, gpu0Stream, dataGpu0_0, dataGpu1, numElements); From ccc085e4db8fe62f4e6f558a8b25128b9d932042 Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 10:39:16 -0500 Subject: [PATCH 162/171] Add option to pass names to HCC dispatch API (for debug) [ROCm/clr commit: c7382f7da606c0b4f3d1aec9653d6d9c82d0febd] --- projects/clr/hipamd/src/hip_module.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 2a3bfabc28..b8c032da27 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -451,7 +451,13 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, hc::completion_future cf; lp.av->dispatch_hsa_kernel(&aql, config[1] /* kernarg*/, kernArgSize, - (startEvent || stopEvent) ? &cf : nullptr); + (startEvent || stopEvent) ? &cf : nullptr +#define USE_NAMED_KERNEL 0 +#if USE_NAMED_KERNEL + , f->_name.c_str() +#endif + ); + if (startEvent) { From 16b2a0b76eeb7cdc419cd877535145e935924d3e Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Fri, 23 Jun 2017 17:12:04 -0500 Subject: [PATCH 163/171] Add docs for launch_bounds. 
[ROCm/clr commit: 76c23972816ae6298633739663bee55ce35cf819] --- projects/clr/hipamd/docs/markdown/hip_faq.md | 1 - .../docs/markdown/hip_kernel_language.md | 55 ++++++++++++++----- 2 files changed, 42 insertions(+), 14 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_faq.md b/projects/clr/hipamd/docs/markdown/hip_faq.md index 07ec5f1d8b..ddf70f2875 100644 --- a/projects/clr/hipamd/docs/markdown/hip_faq.md +++ b/projects/clr/hipamd/docs/markdown/hip_faq.md @@ -70,7 +70,6 @@ See the [API Support Table](CUDA_Runtime_API_functions_supported_by_HIP.md) for - printf - assert - `__restrict__` - - `__launch_bounds__` - `__threadfence*_`, `__syncthreads*` - Unbounded loop unroll diff --git a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md index 3cb7b17a0c..0485188a1f 100644 --- a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md +++ b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md @@ -610,30 +610,59 @@ Device-side dynamic global memory allocation is under development. HIP now incl implementation of malloc and free that can be called from device functions. ## `__launch_bounds__` -GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) that are shared among the active warps. Using more resources can increase the kernel’s IPC, but it reduces the resources available for other warps and limits the number of warps that can run simultaneously. Thus, GPUs exhibit a complex relationship between resource usage and performance. `__launch_bounds__` allows the application to provide usage hints that influence the resources (primarily registers) employed by the generated code. It’s a function attribute that must be attached to a `__global__` function: + + +GPU multiprocessors have a fixed pool of resources (primarily registers and shared memory) which are shared by the actively running warps. 
Using more resources can increase IPC of the kernel but reduces the resources available for other warps and limits the number of warps that can be simultaneously running. Thus GPUs have a complex relationship between resource usage and performance. + +__hip_launch_bounds__ allows the application to provide usage hints that influence the resources (primarily registers) used by the generated code. +__hip_launch_bounds__ is a function attribute that must be attached to a __global__ function: ``` -__global__ void -`__launch_bounds__`(requiredMaxThreadsPerBlock, minBlocksPerMultiprocessor) +__global__ void `__launch_bounds__`(MAX_THREADS_PER_BLOCK, MIN_WARPS_PER_EU) MyKernel(...) ... MyKernel(hipGridLaunch lp, ...) ... ``` -`__launch_bounds__` supports two parameters: +__launch_bounds__ supports two parameters: +- MAX_THREADS_PER_BLOCK - The programmer guarantees that the kernel will be launched with threads less than MAX_THREADS_PER_BLOCK. (On NVCC this maps to the .maxntid PTX directive). If no launch_bounds is specified, MAX_THREADS_PER_BLOCK is the maximum block size supported by the device (typically 1024 or larger). Specifying MAX_THREADS_PER_BLOCK less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation that supports all possible block sizes at launch time. +The threads-per-block is the product of (hipBlockDim_x * hipBlockDim_y * hipBlockDim_z). +- MIN_WARPS_PER_EU - directs the compiler to minimize resource usage so that the requested number of warps can be simultaneously active on a multi-processor. Since active warps compete for the same fixed pool of resources, the compiler must reduce resources required by each warp (primarily registers). MIN_WARPS_PER_EU is optional and defaults to 1 if not specified. Specifying a MIN_WARPS_PER_EU greater than the default 1 effectively constrains the compiler's resource usage.
-- **requiredMaxThreadsPerBlock**---the programmer guarantees that the kernel will launch with threadsPerBlock less than requiredMaxThreadsPerBlock. (In nvcc, this parameter maps to the _.maxntid_ PTX directive; in hcc, it maps to the HSAIL _requiredworkgroupsize_ directive.) If launch_bounds is unspecified, requiredMaxThreadsPerBlock is the maximum block size that the device supports (typically 1,024 or larger). Specifying requiredMaxThreadsPerBlock less than the maximum effectively allows the compiler to use more resources than a default unconstrained compilation supporting all possible block sizes at launch time. The threadsPerBlock value is the product hipBlockDim_x * hipBlockDim_y * hipBlockDim_z. -- **minBlocksPerMultiprocessor**---directs the compiler to minimize resource usage so that the requested number of blocks can be simultaneously active on a multiprocessor. Because active blocks compete for the same fixed resource pool, the compiler must reduce the resource requirements of each block (primarily registers). minBlocksPerMultiprocessor is optional and defaults to 1 if unspecified. Selecting a minBlocksPerMultiprocessor value greater than 1 effectively constrains the compiler's resource usage. +### Compiler Impact +The compiler uses these parameters as follows: +- The compiler uses the hints only to manage register usage, and does not automatically reduce shared memory or other resources. +- Compilation fails if the compiler cannot generate a kernel which meets the requirements of the specified launch bounds. +- From MAX_THREADS_PER_BLOCK, the compiler derives the maximum number of warps/block that can be used at launch time. +Values of MAX_THREADS_PER_BLOCK less than the default allow the compiler to use a larger pool of registers: each warp uses registers, and this hint constrains the launch to a warps/block size which is less than maximum.
+- From MIN_WARPS_PER_EU, the compiler derives a maximum number of registers that can be used by the kernel (to meet the required #simultaneous active blocks). +If MIN_WARPS_PER_EU is 1, then the kernel can use all registers supported by the multiprocessor. +- The compiler ensures that the number of registers used in the kernel is less than both allowed maximums, typically by spilling registers (to shared or global memory), or by using more instructions. +- The compiler may use heuristics to increase register usage, or may simply be able to avoid spilling. The MAX_THREADS_PER_BLOCK is particularly useful in this case, since it allows the compiler to use more registers and avoid situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size that is never used at launch time. -The compiler uses these two parameters as follows: -- It employs the hints only to manage register usage and does not automatically reduce shared memory or other resources. -- Compilation fails if the compiler cannot generate a kernel that meets the requirements of the specified launch bounds. -- From requiredMaxThreadsPerBlock, the compiler derives the maximum number of warps per block that are usable at launch time. Values less than the default allow the compiler to use a larger register pool: each warp uses registers, and this hint constrains the launch to a warps-per-block size less than maximum. -- From minBlocksPerMultiprocessor, the compiler derives a maximum number of registers that the kernel can use (to meet the required number of simultaneously active blocks). If the value is 1, the kernel can use all registers supported by the multiprocessor. -The compiler ensures that the kernel uses fewer registers than both allowed maxima specify, typically by spilling to shared memory or using more instructions. It may use heuristics to increase register usage or may simply be able to avoid spilling.
The requiredMaxThreadsPerBlock parameter is particularly useful in this case, since it allows the compiler to use more registers---avoiding situations where the compiler constrains the register usage (potentially spilling) to meet the requirements of a large block size never sees use at launch time. +### CU and EU Definitions +A compute unit (CU) is responsible for executing the waves of a work-group. It is composed of one or more execution units (EU) which are responsible for executing waves. An EU can have enough resources to maintain the state of more than one executing wave. This allows an EU to hide latency by switching between waves in a similar way to symmetric multithreading on a CPU. In order to allow the state for multiple waves to fit on an EU, the resources used by a single wave have to be limited. Limiting such resources can allow greater latency hiding, but can result in having to spill some register state to memory. This attribute allows an advanced developer to tune the number of waves that are capable of fitting within the resources of an EU. It can be used to ensure at least a certain number will fit to help hide latency, and can also be used to ensure no more than a certain number will fit to limit cache thrashing. + +### Porting from CUDA __launch_bounds +CUDA defines a __launch_bounds which is also designed to control occupancy: +``` +__launch_bounds(MAX_THREADS_PER_BLOCK, MIN_BLOCKS_PER_MULTIPROCESSOR) +``` -HIP/hcc will parse the `launch_bounds` attribute but silently ignores the performance hint. Full support is under development. +- The second parameter __launch_bounds parameters must be converted to the format used __hip_launch_bounds, which uses warps and execution-units rather than blocks and multi-processors ( This conversion is performed automatically by the clang hipify tools.) 
+``` +MIN_WARPS_PER_EXECUTION_UNIT = (MIN_BLOCKS_PER_MULTIPROCESSOR * MAX_THREADS_PER_BLOCK) / 32 +``` +The key differences in the interface are: +- Warps (rather than blocks): +The developer is trying to tell the compiler to control resource utilization to guarantee some amount of active Warps/EU for latency hiding. Specifying active warps in terms of blocks appears to hide the micro-architectural details of the warp size, but makes the interface more confusing since the developer ultimately needs to compute the number of warps to obtain the desired level of control. +- Execution Units (rather than multiProcessor): +The use of execution units rather than multiprocessors provides support for architectures with multiple execution units/multi-processor. For example, the AMD GCN architecture has 4 execution units per multiProcessor. The hipDeviceProps has a field executionUnitsPerMultiprocessor. +Platform-specific coding techniques such as #ifdef can be used to specify different launch_bounds for NVCC and HCC platforms, if desired. + + +### maxregcount Unlike nvcc, hcc does not support the "--maxregcount" option. Instead, users are encouraged to use the hip_launch_bounds directive since the parameters are more intuitive and portable than micro-architecture details like registers, and also the directive allows per-kernel control rather than an entire file. hip_launch_bounds works on both hcc and nvcc targets. 
From ede75a84ab53177d8f223ab40814e06b1c7138b9 Mon Sep 17 00:00:00 2001 From: Aditya Atluri Date: Fri, 23 Jun 2017 19:05:34 -0500 Subject: [PATCH 164/171] fixed default args for symbol memcpy apis Change-Id: Ie0b63f8b9c5535eb3946bd6af3f30fe71a015244 [ROCm/clr commit: 3043766899e5f81fcf591e80648928572e253a04] --- .../clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h | 8 ++++---- .../clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index fde38c8395..724bf09b21 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -1194,7 +1194,7 @@ hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t siz * * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync */ -hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind); +hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyHostToDevice); /** @@ -1214,11 +1214,11 @@ hipError_t hipMemcpyToSymbol(const void* symbolName, const void *src, size_t siz * * @see hipMemcpy, hipMemcpy2D, hipMemcpyToArray, hipMemcpy2DToArray, hipMemcpyFromArray, hipMemcpy2DFromArray, hipMemcpyArrayToArray, hipMemcpy2DArrayToArray, hipMemcpyFromSymbol, hipMemcpyAsync, hipMemcpy2DAsync, hipMemcpyToArrayAsync, hipMemcpy2DToArrayAsync, hipMemcpyFromArrayAsync, hipMemcpy2DFromArrayAsync, 
hipMemcpyToSymbolAsync, hipMemcpyFromSymbolAsync */ -hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream); +hipError_t hipMemcpyToSymbolAsync(const void* symbolName, const void *src, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0); -hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind); +hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost); -hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream); +hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0); /** * @brief Copy data from src to dst asynchronously. diff --git a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h index f92523a3e3..b1011aac6c 100644 --- a/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/nvcc_detail/hip_runtime_api.h @@ -360,16 +360,16 @@ inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType))); } -inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream) { - return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType))); +inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src, size_t sizeBytes, size_t offset, hipMemcpyKind copyType, hipStream_t stream = 0) { + return 
hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream)); } -inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind) +inline static hipError_t hipMemcpyFromSymbol(void *dst, const void* symbolName, size_t sizeBytes, size_t offset = 0, hipMemcpyKind kind = hipMemcpyDeviceToHost) { return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind))); } -inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream) +inline static hipError_t hipMemcpyFromSymbolAsync(void *dst, const void* symbolName, size_t sizeBytes, size_t offset, hipMemcpyKind kind, hipStream_t stream = 0) { return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream)); } From 2582679911b3acc36133f303394e9e70e720d0b4 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Mon, 26 Jun 2017 15:29:38 -0500 Subject: [PATCH 165/171] Add support of HIP_HIDDEN_FREE_MEM, to deduct the returned available memory from hipMemGetInfo API, measured in MB. 
Change-Id: I7a8260c12e032e04e26611db4c38c893a29f2653 [ROCm/clr commit: e5ce58530729e4a1604453452d86e06480204867] --- projects/clr/hipamd/src/hip_hcc.cpp | 5 +++-- projects/clr/hipamd/src/hip_hcc_internal.h | 2 +- projects/clr/hipamd/src/hip_memory.cpp | 4 ++++ 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 061714070e..364db80537 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -78,6 +78,7 @@ int HIP_FORCE_P2P_HOST = 0; int HIP_FAIL_SOC = 0; int HIP_DENY_PEER_ACCESS = 0; +int HIP_HIDDEN_FREE_MEM = 0; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; @@ -1204,8 +1205,8 @@ void HipReadEnv() tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels); } READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed. Impacts hipMemcpyAsync, hipMemsetAsync." ); - - + + READ_ENV_I(release, HIP_HIDDEN_FREE_MEM, 0, "Amount of memory to hide from the free memory reported by hipMemGetInfo, specified in MB. Impacts hipMemGetInfo." ); READ_ENV_C(release, HIP_DB, 0, "Print debug info. Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback); if ((HIP_DB & (1<_acc, &deviceMemSize, &hostMemSize, &userMemSize); *free = device->_props.totalGlobalMem - deviceMemSize; + + // Deduct the amount of memory from the free memory reported from the system + if(HIP_HIDDEN_FREE_MEM) + *free -= (size_t)HIP_HIDDEN_FREE_MEM*1024*1024; } else { e = hipErrorInvalidValue; From 9f949c5ec33f2a8c062a31c8edfb7eaa55c80118 Mon Sep 17 00:00:00 2001 From: sunway513 Date: Mon, 26 Jun 2017 22:47:22 +0000 Subject: [PATCH 166/171] Fix docs for HIP_TRACE_API bit masks. 
[ROCm/clr commit: c80d906624ff215f28ffd4c15476d5a221a73773] --- projects/clr/hipamd/docs/markdown/hip_profiling.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index ef349ef2a5..ac277c8433 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -268,9 +268,12 @@ PASSED! ``` HIP_TRACE_API supports multiple levels of debug information: - - 0x1 = print all HIP APIs - - 0x2 = print HIP APIs which initiate GPU kernels, copies, or memsets. Includes hipLaunchKernel, hipMemcpy*, hipMemset*. - - 0x4 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + - 0x1 = print all HIP APIs. This is the most verbose setting; the flags below allow selecting a subset. + - 0x2 = print HIP APIs which initiate GPU kernel commands. Includes hipLaunchKernel, hipModuleLaunchKernel. + - 0x4 = print HIP APIs which initiate GPU memory commands. Includes hipMemcpy*, hipMemset*. + - 0x8 = print HIP APIs which allocate or free memory. Includes hipMalloc, hipHostMalloc, hipFree, hipHostFree. + +These can be combined. For example, HIP_TRACE_API=6 shows a concise view of the HIP commands (both kernel and memory) that are sent to the GPU. 
#### Color From 639a6d5e46fc53058ba224f2c0f257fc516f3fee Mon Sep 17 00:00:00 2001 From: Ben Sander Date: Tue, 27 Jun 2017 12:17:12 -0500 Subject: [PATCH 167/171] Set default HIP_HIDDEN_FREE_MEM [ROCm/clr commit: eb2c5e166c6412f3f25b973c21508122f462dbb2] --- projects/clr/hipamd/src/hip_hcc.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/src/hip_hcc.cpp b/projects/clr/hipamd/src/hip_hcc.cpp index 364db80537..be591f2f04 100644 --- a/projects/clr/hipamd/src/hip_hcc.cpp +++ b/projects/clr/hipamd/src/hip_hcc.cpp @@ -78,7 +78,7 @@ int HIP_FORCE_P2P_HOST = 0; int HIP_FAIL_SOC = 0; int HIP_DENY_PEER_ACCESS = 0; -int HIP_HIDDEN_FREE_MEM = 0; +int HIP_HIDDEN_FREE_MEM = 256; // Force async copies to actually use the synchronous copy interface. int HIP_FORCE_SYNC_COPY = 0; From 8a45f03573b69122c356f548f460c5a074442354 Mon Sep 17 00:00:00 2001 From: "Sun, Peng" Date: Tue, 27 Jun 2017 14:15:16 -0500 Subject: [PATCH 168/171] Remove some warning debug info and add weak attribute back to GGL __global__ define Change-Id: I2021b107dda697b1262d44fa1506465e94a3916b [ROCm/clr commit: 3d2a729580e5c620674d512372ac75426058f2e7] --- projects/clr/hipamd/include/hip/hcc_detail/host_defines.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h index 212fd650a3..b2e7ac2617 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/host_defines.h @@ -41,13 +41,10 @@ THE SOFTWARE. 
#define __host__ __attribute__((cpu)) #define __device__ __attribute__((hc)) -//#warning "HOST DEFINE header included" #if GENERIC_GRID_LAUNCH == 0 -//#warning "original global define reached" #define __global__ __attribute__((hc_grid_launch)) __attribute__((used)) #else -//#warning "GGL global define reached" -#define __global__ __attribute__((annotate("hip__global__"), hc, used)) +#define __global__ __attribute__((annotate("hip__global__"), hc, used, weak)) #endif //GENERIC_GRID_LAUNCH #define __noinline__ __attribute__((noinline)) From 0d555fa3f49ec554eca4dd6e019de74fc4a85b37 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 10:26:04 +0530 Subject: [PATCH 169/171] [build] link libCXLActivityLogger.so when COMPILE_HIP_ATP_MARKER=1 Change-Id: I0bfffd924cd858bec7436acf3ccb1e3375172f27 [ROCm/clr commit: bb928ee8ecf6feb537ac07cbc136fc66f1327da2] --- projects/clr/hipamd/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index eee1a14a8a..b3ea5a3ca3 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -193,6 +193,9 @@ if(HIP_PLATFORM STREQUAL "hcc") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${HCC_LD_FLAGS} -Wl,-Bsymbolic") #find_package(LLVM HINTS ${HCC_HOME}/compiler/lib/cmake) set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --amdgpu-target=gfx701 --amdgpu-target=gfx801 --amdgpu-target=gfx802 --amdgpu-target=gfx803 --amdgpu-target=gfx900") + if(COMPILE_HIP_ATP_MARKER) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L/opt/rocm/profiler/CXLActivityLogger/bin/x86_64 -lCXLActivityLogger") + endif() add_library(hip_hcc SHARED ${SOURCE_FILES_RUNTIME}) target_link_libraries(hip_hcc PRIVATE hc_am) #target_link_libraries(hip_hcc PUBLIC LLVMAMDGPUUtils) From 7b011a5996096f537b00449adb6a12615f54ec77 Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 11:32:25 +0530 Subject: 
[PATCH 170/171] Update toc in markdown documentation Change-Id: I6da7053672b306442f3640fff3471efe25593870 [ROCm/clr commit: f907a986a1445868281610e0c9c1f1fd5921111d] --- projects/clr/hipamd/docs/markdown/hip_bugs.md | 4 +++- projects/clr/hipamd/docs/markdown/hip_kernel_language.md | 4 ++++ projects/clr/hipamd/docs/markdown/hip_profiling.md | 5 ----- projects/clr/hipamd/hipify-clang/README.md | 2 +- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_bugs.md b/projects/clr/hipamd/docs/markdown/hip_bugs.md index 91b2a5a019..78f0e53467 100644 --- a/projects/clr/hipamd/docs/markdown/hip_bugs.md +++ b/projects/clr/hipamd/docs/markdown/hip_bugs.md @@ -1,7 +1,9 @@ -# HIP Bugs +# HIP Bugs + - [Errors related to undefined reference to `__hcLaunchKernel__***__grid_launch_parm**`](#errors-related-to-undefined-reference-to-__hclaunchkernel____grid_launch_parm) +- [Can't find kernels inside dynamic linked library](#cant-find-kernels-inside-dynamic-linked-library) - [What is the current limitation of HIP Generic Grid Launch method?](#what-is-the-current-limitation-of-hip-generic-grid-launch-method) - [Errors related to `no matching constructor`](#errors-related-to-no-matching-constructor) - [HIP is more restrictive in enforcing restrictions](#hip-is-more-restrictive-in-enforcing-restrictions) diff --git a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md index 0485188a1f..cfa5d0f871 100644 --- a/projects/clr/hipamd/docs/markdown/hip_kernel_language.md +++ b/projects/clr/hipamd/docs/markdown/hip_kernel_language.md @@ -40,6 +40,10 @@ - [Printf](#printf) - [Device-Side Dynamic Global Memory Allocation](#device-side-dynamic-global-memory-allocation) - [`__launch_bounds__`](#__launch_bounds__) + * [Compiler Impact](#compiler-impact) + * [CU and EU Definitions](#cu-and-eu-definitions) + * [Porting from CUDA __launch_bounds](#porting-from-cuda-__launch_bounds) + * 
[maxregcount](#maxregcount) - [Register Keyword](#register-keyword) - [Pragma Unroll](#pragma-unroll) - [In-Line Assembly](#in-line-assembly) diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index ac277c8433..a659216044 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -23,11 +23,6 @@ This document starts with some of the general capabilities of CodeXL and then de - [Tracing and Debug](#tracing-and-debug) * [Tracing HIP APIs](#tracing-hip-apis) + [Color](#color) - * [Using HIP_DB](#using-hip_db) - * [Using ltrace](#using-ltrace) - * [Chicken bits](#chicken-bits) - * [Debugging HIP Applications](#debugging-hip-applications) - * [General Debugging Tips](#general-debugging-tips) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index d74c53f187..20456f3bff 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -5,7 +5,7 @@ - [Using hipify-clang](#using-hipify-clang) * [Build and install](#build-and-install) * [Running and using hipify-clang](#running-and-using-hipify-clang) - + [Disclaimer](#disclaimer) +- [Disclaimer](#disclaimer) From e775308c65a80f9bf473e8a15533c90b6506794e Mon Sep 17 00:00:00 2001 From: Maneesh Gupta Date: Wed, 28 Jun 2017 11:32:40 +0530 Subject: [PATCH 171/171] Update release notes Change-Id: I5001ef03692159fcf9825102b37066ec26e6b8d2 [ROCm/clr commit: c782c68b1fb0212d97fa3d07018a46dda301e15a] --- projects/clr/hipamd/RELEASE.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/clr/hipamd/RELEASE.md b/projects/clr/hipamd/RELEASE.md index d6f3ec594c..a1e580b7b0 100644 --- a/projects/clr/hipamd/RELEASE.md +++ b/projects/clr/hipamd/RELEASE.md @@ -8,12 +8,15 @@ We have attempted to document known bugs and limitations - in particular the [HI ## Revision History: 
=================================================================================================== +Release: 1.2 +Date: 2017.06.29 - new APIs: hipMemcpy2DAsync, hipMallocPitch, hipHostMallocCoherent, hipHostMallocNonCoherent - added support for building hipify-clang using clang 3.9 - hipify-clang updates for CUDA 8.0 runtime+driver support - renamed hipify to hipify-perl - initial implementation of hipify-cmakefile - several documentation updates & bug fixes +- support for abort() function in device code ===================================================================================================