From 49e3b900bb968964c39b07efe400e66378e58d01 Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Tue, 15 Dec 2020 17:38:08 -0500
Subject: [PATCH 001/177] Remove hip-hcc codes: Part three

1.Rename include/hip/hcc_detail/ as include/hip/amd_detail/

2.Rename include/hip/nvcc_detail/ as include/hip/nvidia_detail/

3.Create __HIP_PLATFORM_AMD__ to replace __HIP_PLATFORM_HCC__

4.Create __HIP_PLATFORM_NVIDIA__ to replace __HIP_PLATFORM_NVCC__

Once the legacy names hcc_detail, nvcc_detail, __HIP_PLATFORM_HCC__ and
__HIP_PLATFORM_NVCC__ have been removed upstream, they will also be removed from the HIP runtime.

Change-Id: I1ae457effd739d6c25bca203c1724b026be21fce
---
 .../hip/nvidia_detail/channel_descriptor.h    |   28 +
 hipnv/include/hip/nvidia_detail/hip_complex.h |  119 +
 .../nvidia_detail/hip_cooperative_groups.h    |   12 +
 hipnv/include/hip/nvidia_detail/hip_runtime.h |  123 +
 .../hip/nvidia_detail/hip_runtime_api.h       | 2045 +++++++++++++++++
 .../hip/nvidia_detail/hip_texture_types.h     |    6 +
 6 files changed, 2333 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/channel_descriptor.h
 create mode 100644 hipnv/include/hip/nvidia_detail/hip_complex.h
 create mode 100644 hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
 create mode 100644 hipnv/include/hip/nvidia_detail/hip_runtime.h
 create mode 100644 hipnv/include/hip/nvidia_detail/hip_runtime_api.h
 create mode 100644 hipnv/include/hip/nvidia_detail/hip_texture_types.h

diff --git a/hipnv/include/hip/nvidia_detail/channel_descriptor.h b/hipnv/include/hip/nvidia_detail/channel_descriptor.h
new file mode 100644
index 0000000000..7eb0e65fda
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/channel_descriptor.h
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+// Angle brackets on purpose: a quoted include would resolve to this same-named wrapper first.
+#include <channel_descriptor.h>
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_complex.h b/hipnv/include/hip/nvidia_detail/hip_complex.h
new file mode 100644
index 0000000000..10a53d1743
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hip_complex.h
@@ -0,0 +1,119 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+
+#include "cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+/** Real part of z; forwards to cuCrealf. */
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); }
+/** Imaginary part of z; forwards to cuCimagf. */
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); }
+/** Builds a hipFloatComplex from a and b via make_cuFloatComplex. */
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+    return make_cuFloatComplex(a, b);
+}
+/** Conjugate of z; forwards to cuConjf. */
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); }
+/** Squared magnitude re^2 + im^2, taken from the components (no sqrt round trip). */
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
+    return cuCrealf(z) * cuCrealf(z) + cuCimagf(z) * cuCimagf(z);
+}
+/** Sum p + q; forwards to cuCaddf. */
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCaddf(p, q);
+}
+/** Difference p - q; forwards to cuCsubf. */
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCsubf(p, q);
+}
+/** Product p * q; forwards to cuCmulf. */
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCmulf(p, q);
+}
+/** Quotient p / q; forwards to cuCdivf. */
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCdivf(p, q);
+}
+/** Magnitude of z; forwards to cuCabsf. */
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); }
+
+typedef cuDoubleComplex hipDoubleComplex;
+/** Real part of z; forwards to cuCreal. */
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); }
+/** Imaginary part of z; forwards to cuCimag. */
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); }
+/** Builds a hipDoubleComplex from a and b via make_cuDoubleComplex. */
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+    return make_cuDoubleComplex(a, b);
+}
+/** Conjugate of z; forwards to cuConj. */
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); }
+/** Squared magnitude re^2 + im^2, taken from the components (no sqrt round trip). */
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
+    return cuCreal(z) * cuCreal(z) + cuCimag(z) * cuCimag(z);
+}
+/** Sum p + q; forwards to cuCadd. */
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCadd(p, q);
+}
+/** Difference p - q; forwards to cuCsub. */
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCsub(p, q);
+}
+/** Product p * q; forwards to cuCmul. */
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCmul(p, q);
+}
+/** Quotient p / q; forwards to cuCdiv. */
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCdiv(p, q);
+}
+/** Magnitude of z; forwards to cuCabs. */
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); }
+
+typedef cuFloatComplex hipComplex;
+/** Builds a hipComplex via make_cuComplex. NOTE(review): name lacks the hip prefix -- intended? */
+__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
+    return make_cuComplex(x, y);
+}
+/** Converts a double-precision complex to single precision via cuComplexDoubleToFloat. */
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+    return cuComplexDoubleToFloat(z);
+}
+/** Converts a single-precision complex to double precision via cuComplexFloatToDouble. */
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+    return cuComplexFloatToDouble(z);
+}
+/** Forwards to cuCfmaf(p, q, r), CUDA's single-precision complex multiply-add. */
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+    return cuCfmaf(p, q, r);
+}
+/** Forwards to cuCfma(p, q, r), CUDA's double-precision complex multiply-add. */
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
+    return cuCfma(p, q, r);
+}
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h b/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
new file mode 100644
index 0000000000..fc98ae2281
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
@@ -0,0 +1,12 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+// Include CUDA headers
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+
+// Include HIP wrapper headers around CUDA
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime.h b/hipnv/include/hip/nvidia_detail/hip_runtime.h
new file mode 100644
index 0000000000..84414fb4a3
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime.h
@@ -0,0 +1,123 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+
+#include <cuda_runtime.h>
+
+#include <hip/hip_runtime_api.h>
+// HIP_KERNEL_NAME expands to its arguments unchanged.
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+// hipLaunchParm is a plain int in this header.
+typedef int hipLaunchParm;
+// Launches kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>> with the trailing args.
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
+    do {                                                                                           \
+        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
+    } while (0)
+// Entry point: parenthesizes kernelName and forwards every remaining argument to the helper.
+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+
+#define hipReadModeElementType cudaReadModeElementType
+
+#ifdef __CUDA_ARCH__
+// Feature flags for device code; each one expands to a comparison against __CUDA_ARCH__.
+// NOTE(review): the 64-bit atomics thresholds look swapped vs. CUDA's capability table -- confirm.
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
+
+#endif
+
+#ifdef __CUDACC__
+// hipThreadIdx_*, hipBlockIdx_*, hipBlockDim_* and hipGridDim_* expand to the matching
+// component of CUDA's threadIdx, blockIdx, blockDim and gridDim.
+#define hipThreadIdx_x threadIdx.x
+#define hipThreadIdx_y threadIdx.y
+#define hipThreadIdx_z threadIdx.z
+
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z
+
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z
+
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z
+// HIP_SYMBOL(X) expands to &X (the expansion is not parenthesized).
+#define HIP_SYMBOL(X) &X
+
+/**
+ * HIP_DYNAMIC_SHARED(type, var) declares `extern __shared__ type var[];` (semicolon included).
+ */
+
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+// HIP_DYNAMIC_SHARED_ATTRIBUTE expands to nothing in this header.
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+// Device-side assert: trap when COND is false (COND is parenthesized so `!` negates all of it).
+#ifdef __HIP_DEVICE_COMPILE__
+#define abort_()                                                                                   \
+    { asm("trap;"); }
+#undef assert
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) {                                                                             \
+            abort_();                                                                              \
+        }                                                                                          \
+    }
+#endif
+// __clock() and __clock64() expand to calls of clock() and clock64().
+#define __clock() clock()
+#define __clock64() clock64()
+
+#endif
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
new file mode 100644
index 0000000000..eb3df19bc4
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -0,0 +1,2045 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
+
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cuda_fp16.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+// __dparm(x) supplies `= x` as a default argument in C++ and expands to nothing in C.
+#ifdef __cplusplus
+#define __dparm(x) = x
+#else
+#define __dparm(x)
+#endif
+
+// __HIP_DEPRECATED: deprecation marker for MSVC/GNU-style compilers, empty in every other case.
+#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
+#define __HIP_DEPRECATED
+#elif defined(_MSC_VER)
+#define __HIP_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __HIP_DEPRECATED __attribute__((deprecated))
+#else
+#define __HIP_DEPRECATED
+#endif
+
+
+// TODO -move to include/hip_runtime_api.h as a common implementation.
+/**
+ * Memory copy types: the five copy kinds, declared here as a HIP-owned enum
+ * (not an alias of a CUDA type); the enumerators take the implicit values 0..4.
+ */
+typedef enum hipMemcpyKind {
+    hipMemcpyHostToHost,
+    hipMemcpyHostToDevice,
+    hipMemcpyDeviceToHost,
+    hipMemcpyDeviceToDevice,
+    hipMemcpyDefault
+} hipMemcpyKind;
+
+// hipDataType
+#define hipDataType cudaDataType
+#define HIP_R_16F CUDA_R_16F
+#define HIP_R_32F CUDA_R_32F
+#define HIP_R_64F CUDA_R_64F
+#define HIP_C_16F CUDA_C_16F
+#define HIP_C_32F CUDA_C_32F
+#define HIP_C_64F CUDA_C_64F
+
+// hipLibraryPropertyType
+#define hipLibraryPropertyType libraryPropertyType
+#define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
+#define HIP_LIBRARY_MINOR_VERSION MINOR_VERSION
+#define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL
+
+#define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR
+
+// hipArray_Format enumerators
+#define HIP_AD_FORMAT_UNSIGNED_INT8   CU_AD_FORMAT_UNSIGNED_INT8
+#define HIP_AD_FORMAT_UNSIGNED_INT16  CU_AD_FORMAT_UNSIGNED_INT16
+#define HIP_AD_FORMAT_UNSIGNED_INT32  CU_AD_FORMAT_UNSIGNED_INT32
+#define HIP_AD_FORMAT_SIGNED_INT8     CU_AD_FORMAT_SIGNED_INT8
+#define HIP_AD_FORMAT_SIGNED_INT16    CU_AD_FORMAT_SIGNED_INT16
+#define HIP_AD_FORMAT_SIGNED_INT32    CU_AD_FORMAT_SIGNED_INT32
+#define HIP_AD_FORMAT_HALF            CU_AD_FORMAT_HALF
+#define HIP_AD_FORMAT_FLOAT           CU_AD_FORMAT_FLOAT
+
+// hipArray_Format
+#define hipArray_Format CUarray_format
+/** Maps a HIP array format to CUarray_format; unknown values -> CU_AD_FORMAT_UNSIGNED_INT8. */
+inline static CUarray_format hipArray_FormatToCUarray_format(
+    hipArray_Format format) {
+    switch (format) {
+        case HIP_AD_FORMAT_UNSIGNED_INT8:
+            return CU_AD_FORMAT_UNSIGNED_INT8;
+        case HIP_AD_FORMAT_UNSIGNED_INT16:
+            return CU_AD_FORMAT_UNSIGNED_INT16;
+        case HIP_AD_FORMAT_UNSIGNED_INT32:
+            return CU_AD_FORMAT_UNSIGNED_INT32;
+        case HIP_AD_FORMAT_SIGNED_INT8:
+            return CU_AD_FORMAT_SIGNED_INT8;
+        case HIP_AD_FORMAT_SIGNED_INT16:
+            return CU_AD_FORMAT_SIGNED_INT16;
+        case HIP_AD_FORMAT_SIGNED_INT32:
+            return CU_AD_FORMAT_SIGNED_INT32;
+        case HIP_AD_FORMAT_HALF:
+            return CU_AD_FORMAT_HALF;
+        case HIP_AD_FORMAT_FLOAT:
+            return CU_AD_FORMAT_FLOAT;
+        default:
+            return CU_AD_FORMAT_UNSIGNED_INT8;
+    }
+}
+
+#define HIP_TR_ADDRESS_MODE_WRAP   CU_TR_ADDRESS_MODE_WRAP
+#define HIP_TR_ADDRESS_MODE_CLAMP  CU_TR_ADDRESS_MODE_CLAMP
+#define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR
+#define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER
+
+// hipAddress_mode
+#define hipAddress_mode CUaddress_mode
+/** Maps a HIP address mode to CUaddress_mode; unknown values -> CU_TR_ADDRESS_MODE_WRAP. */
+inline static CUaddress_mode hipAddress_modeToCUaddress_mode(
+    hipAddress_mode mode) {
+    switch (mode) {
+        case HIP_TR_ADDRESS_MODE_WRAP:
+            return CU_TR_ADDRESS_MODE_WRAP;
+        case HIP_TR_ADDRESS_MODE_CLAMP:
+            return CU_TR_ADDRESS_MODE_CLAMP;
+        case HIP_TR_ADDRESS_MODE_MIRROR:
+            return CU_TR_ADDRESS_MODE_MIRROR;
+        case HIP_TR_ADDRESS_MODE_BORDER:
+            return CU_TR_ADDRESS_MODE_BORDER;
+        default:
+            return CU_TR_ADDRESS_MODE_WRAP;
+    }
+}
+
+#define HIP_TR_FILTER_MODE_POINT   CU_TR_FILTER_MODE_POINT
+#define HIP_TR_FILTER_MODE_LINEAR  CU_TR_FILTER_MODE_LINEAR
+
+// hipFilter_mode
+#define hipFilter_mode CUfilter_mode
+/** Maps a HIP filter mode to CUfilter_mode; unknown values -> CU_TR_FILTER_MODE_POINT. */
+inline static CUfilter_mode hipFilter_mode_enumToCUfilter_mode(
+    hipFilter_mode mode) {
+    switch (mode) {
+        case HIP_TR_FILTER_MODE_POINT:
+            return CU_TR_FILTER_MODE_POINT;
+        case HIP_TR_FILTER_MODE_LINEAR:
+            return CU_TR_FILTER_MODE_LINEAR;
+        default:
+            return CU_TR_FILTER_MODE_POINT;
+    }
+}
+
+// hipResourcetype enumerators
+#define HIP_RESOURCE_TYPE_ARRAY            CU_RESOURCE_TYPE_ARRAY
+#define HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY  CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+#define HIP_RESOURCE_TYPE_LINEAR           CU_RESOURCE_TYPE_LINEAR
+#define HIP_RESOURCE_TYPE_PITCH2D          CU_RESOURCE_TYPE_PITCH2D
+
+// hipResourcetype
+#define hipResourcetype CUresourcetype
+/** Maps a HIP resource type to CUresourcetype; unknown values -> CU_RESOURCE_TYPE_ARRAY. */
+inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
+    hipResourcetype resType) {
+    switch (resType) {
+        case HIP_RESOURCE_TYPE_ARRAY:
+            return CU_RESOURCE_TYPE_ARRAY;
+        case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
+            return CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+        case HIP_RESOURCE_TYPE_LINEAR:
+            return CU_RESOURCE_TYPE_LINEAR;
+        case HIP_RESOURCE_TYPE_PITCH2D:
+            return CU_RESOURCE_TYPE_PITCH2D;
+        default:
+            return CU_RESOURCE_TYPE_ARRAY;
+    }
+}
+
+#define hipTexRef CUtexref
+#define hiparray CUarray
+
+// hipTextureAddressMode
+typedef enum cudaTextureAddressMode hipTextureAddressMode;
+#define hipAddressModeWrap cudaAddressModeWrap
+#define hipAddressModeClamp cudaAddressModeClamp
+#define hipAddressModeMirror cudaAddressModeMirror
+#define hipAddressModeBorder cudaAddressModeBorder
+
+// hipTextureFilterMode
+typedef enum cudaTextureFilterMode hipTextureFilterMode;
+#define hipFilterModePoint cudaFilterModePoint
+#define hipFilterModeLinear cudaFilterModeLinear
+
+// hipTextureReadMode
+typedef enum cudaTextureReadMode hipTextureReadMode;
+#define hipReadModeElementType cudaReadModeElementType
+#define hipReadModeNormalizedFloat cudaReadModeNormalizedFloat
+
+// hipChannelFormatKind
+typedef enum cudaChannelFormatKind hipChannelFormatKind;
+#define hipChannelFormatKindSigned      cudaChannelFormatKindSigned
+#define hipChannelFormatKindUnsigned    cudaChannelFormatKindUnsigned
+#define hipChannelFormatKindFloat       cudaChannelFormatKindFloat
+#define hipChannelFormatKindNone        cudaChannelFormatKindNone
+
+#define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode
+#define hipBoundaryModeZero cudaBoundaryModeZero
+#define hipBoundaryModeTrap cudaBoundaryModeTrap
+#define hipBoundaryModeClamp cudaBoundaryModeClamp
+
+// hipFuncCache
+#define hipFuncCachePreferNone cudaFuncCachePreferNone
+#define hipFuncCachePreferShared cudaFuncCachePreferShared
+#define hipFuncCachePreferL1 cudaFuncCachePreferL1
+#define hipFuncCachePreferEqual cudaFuncCachePreferEqual
+
+// hipResourceType
+#define hipResourceType cudaResourceType
+#define hipResourceTypeArray cudaResourceTypeArray
+#define hipResourceTypeMipmappedArray cudaResourceTypeMipmappedArray
+#define hipResourceTypeLinear cudaResourceTypeLinear
+#define hipResourceTypePitch2D cudaResourceTypePitch2D
+//
+// hipErrorNoDevice.
+
+
+//! Flags that can be used with hipEventCreateWithFlags:
+#define hipEventDefault cudaEventDefault
+#define hipEventBlockingSync cudaEventBlockingSync
+#define hipEventDisableTiming cudaEventDisableTiming
+#define hipEventInterprocess cudaEventInterprocess
+#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */
+#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */
+
+
+#define hipHostMallocDefault cudaHostAllocDefault
+#define hipHostMallocPortable cudaHostAllocPortable
+#define hipHostMallocMapped cudaHostAllocMapped
+#define hipHostMallocWriteCombined cudaHostAllocWriteCombined
+#define hipHostMallocCoherent 0x0
+#define hipHostMallocNonCoherent 0x0
+
+#define hipMemAttachGlobal cudaMemAttachGlobal
+#define hipMemAttachHost cudaMemAttachHost
+
+#define hipHostRegisterDefault cudaHostRegisterDefault
+#define hipHostRegisterPortable cudaHostRegisterPortable
+#define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterIoMemory cudaHostRegisterIoMemory
+
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
+#define hipLimitMallocHeapSize cudaLimitMallocHeapSize
+#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
+
+#define hipOccupancyDefault cudaOccupancyDefault
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync    \
+        cudaCooperativeLaunchMultiDeviceNoPreSync
+#define hipCooperativeLaunchMultiDeviceNoPostSync   \
+        cudaCooperativeLaunchMultiDeviceNoPostSync
+
+
+// enum CUjit_option redefines
+#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
+#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
+
+typedef cudaEvent_t hipEvent_t;
+typedef cudaStream_t hipStream_t;
+typedef cudaIpcEventHandle_t hipIpcEventHandle_t;
+typedef cudaIpcMemHandle_t hipIpcMemHandle_t;
+typedef enum cudaLimit hipLimit_t;
+typedef enum cudaFuncAttribute hipFuncAttribute;
+typedef enum cudaFuncCache hipFuncCache_t;
+typedef CUcontext hipCtx_t;
+typedef enum cudaSharedMemConfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUjit_option hipJitOption;
+typedef CUdevice hipDevice_t;
+typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
+#define hipDevP2PAttrPerformanceRank cudaDevP2PAttrPerformanceRank
+#define hipDevP2PAttrAccessSupported cudaDevP2PAttrAccessSupported
+#define hipDevP2PAttrNativeAtomicSupported cudaDevP2PAttrNativeAtomicSupported
+#define hipDevP2PAttrHipArrayAccessSupported cudaDevP2PAttrCudaArrayAccessSupported
+#define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize
+#define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout
+
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
+typedef struct cudaArray hipArray;
+typedef struct cudaArray* hipArray_t;
+typedef struct cudaArray* hipArray_const_t;
+typedef struct cudaFuncAttributes hipFuncAttributes;
+typedef struct cudaLaunchParams hipLaunchParams;
+#define hipFunction_attribute CUfunction_attribute
+#define hip_Memcpy2D CUDA_MEMCPY2D
+#define hipMemcpy3DParms cudaMemcpy3DParms
+#define hipArrayDefault cudaArrayDefault
+#define hipArrayLayered cudaArrayLayered
+#define hipArraySurfaceLoadStore cudaArraySurfaceLoadStore
+#define hipArrayCubemap cudaArrayCubemap
+#define hipArrayTextureGather cudaArrayTextureGather
+
+typedef cudaTextureObject_t hipTextureObject_t;
+typedef cudaSurfaceObject_t hipSurfaceObject_t;
+#define hipTextureType1D cudaTextureType1D
+#define hipTextureType1DLayered cudaTextureType1DLayered
+#define hipTextureType2D cudaTextureType2D
+#define hipTextureType2DLayered cudaTextureType2DLayered
+#define hipTextureType3D cudaTextureType3D
+#define hipDeviceMapHost cudaDeviceMapHost
+
+typedef struct cudaExtent hipExtent;
+typedef struct cudaPitchedPtr hipPitchedPtr;
+#define make_hipExtent make_cudaExtent
+#define make_hipPos make_cudaPos
+#define make_hipPitchedPtr make_cudaPitchedPtr
+// Flags that can be used with hipStreamCreateWithFlags
+#define hipStreamDefault cudaStreamDefault
+#define hipStreamNonBlocking cudaStreamNonBlocking
+
+typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
+typedef struct cudaResourceDesc hipResourceDesc;
+typedef struct cudaTextureDesc hipTextureDesc;
+typedef struct cudaResourceViewDesc hipResourceViewDesc;
+// hipSharedMemConfig bank-size values
+#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
+#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
+#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte
+
+//Function Attributes
+#define HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+#define HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_NUM_REGS CU_FUNC_ATTRIBUTE_NUM_REGS
+#define HIP_FUNC_ATTRIBUTE_PTX_VERSION CU_FUNC_ATTRIBUTE_PTX_VERSION
+#define HIP_FUNC_ATTRIBUTE_BINARY_VERSION CU_FUNC_ATTRIBUTE_BINARY_VERSION
+#define HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA CU_FUNC_ATTRIBUTE_CACHE_MODE_CA
+#define HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+#define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
+// From CUDA 9.0 on, the mask-less __shfl* names forward to the *_sync forms with mask 0xffffffff.
+#if CUDA_VERSION >= 9000
+#define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_xor(...)  __shfl_xor_sync(0xffffffff, __VA_ARGS__)
+#endif // CUDA_VERSION >= 9000
+/** Translates a cudaError_t to hipError_t; codes without a case become hipErrorUnknown. */
+inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
+    switch (cuError) {
+        case cudaSuccess:
+            return hipSuccess;
+        case cudaErrorProfilerDisabled:
+            return hipErrorProfilerDisabled;
+        case cudaErrorProfilerNotInitialized:
+            return hipErrorProfilerNotInitialized;
+        case cudaErrorProfilerAlreadyStarted:
+            return hipErrorProfilerAlreadyStarted;
+        case cudaErrorProfilerAlreadyStopped:
+            return hipErrorProfilerAlreadyStopped;
+        case cudaErrorInsufficientDriver:
+            return hipErrorInsufficientDriver;
+        case cudaErrorUnsupportedLimit:
+            return hipErrorUnsupportedLimit;
+        case cudaErrorPeerAccessUnsupported:
+            return hipErrorPeerAccessUnsupported;
+        case cudaErrorInvalidGraphicsContext:
+            return hipErrorInvalidGraphicsContext;
+        case cudaErrorSharedObjectSymbolNotFound:
+            return hipErrorSharedObjectSymbolNotFound;
+        case cudaErrorSharedObjectInitFailed:
+            return hipErrorSharedObjectInitFailed;
+        case cudaErrorOperatingSystem:
+            return hipErrorOperatingSystem;
+        case cudaErrorSetOnActiveProcess:
+            return hipErrorSetOnActiveProcess;
+        case cudaErrorIllegalAddress:
+            return hipErrorIllegalAddress;
+        case cudaErrorInvalidSymbol:
+            return hipErrorInvalidSymbol;
+        case cudaErrorMissingConfiguration:
+            return hipErrorMissingConfiguration;
+        case cudaErrorMemoryAllocation:
+            return hipErrorOutOfMemory;
+        case cudaErrorInitializationError:
+            return hipErrorNotInitialized;
+        case cudaErrorLaunchFailure:
+            return hipErrorLaunchFailure;
+        case cudaErrorCooperativeLaunchTooLarge:
+            return hipErrorCooperativeLaunchTooLarge;
+        case cudaErrorPriorLaunchFailure:
+            return hipErrorPriorLaunchFailure;
+        case cudaErrorLaunchOutOfResources:
+            return hipErrorLaunchOutOfResources;
+        case cudaErrorInvalidDeviceFunction:
+            return hipErrorInvalidDeviceFunction;
+        case cudaErrorInvalidConfiguration:
+            return hipErrorInvalidConfiguration;
+        case cudaErrorInvalidDevice:
+            return hipErrorInvalidDevice;
+        case cudaErrorInvalidValue:
+            return hipErrorInvalidValue;
+        case cudaErrorInvalidDevicePointer:
+            return hipErrorInvalidDevicePointer;
+        case cudaErrorInvalidMemcpyDirection:
+            return hipErrorInvalidMemcpyDirection;
+        case cudaErrorInvalidResourceHandle:
+            return hipErrorInvalidHandle;
+        case cudaErrorNotReady:
+            return hipErrorNotReady;
+        case cudaErrorNoDevice:
+            return hipErrorNoDevice;
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return hipErrorPeerAccessAlreadyEnabled;
+        case cudaErrorPeerAccessNotEnabled:
+            return hipErrorPeerAccessNotEnabled;
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return hipErrorHostMemoryAlreadyRegistered;
+        case cudaErrorHostMemoryNotRegistered:
+            return hipErrorHostMemoryNotRegistered;
+        case cudaErrorMapBufferObjectFailed:
+            return hipErrorMapFailed;
+        case cudaErrorAssert:
+            return hipErrorAssert;
+        case cudaErrorNotSupported:
+            return hipErrorNotSupported;
+        case cudaErrorCudartUnloading:
+            return hipErrorDeinitialized;
+        case cudaErrorInvalidKernelImage:
+            return hipErrorInvalidImage;
+        case cudaErrorUnmapBufferObjectFailed:
+            return hipErrorUnmapFailed;
+        case cudaErrorNoKernelImageForDevice:
+            return hipErrorNoBinaryForGpu;
+        case cudaErrorECCUncorrectable:
+            return hipErrorECCNotCorrectable;
+        case cudaErrorDeviceAlreadyInUse:
+            return hipErrorContextAlreadyInUse;
+        case cudaErrorInvalidPtx:
+            return hipErrorInvalidKernelFile;
+        case cudaErrorLaunchTimeout:
+            return hipErrorLaunchTimeOut;
+#if CUDA_VERSION >= 10010
+        case cudaErrorInvalidSource:
+            return hipErrorInvalidSource;
+        case cudaErrorFileNotFound:
+            return hipErrorFileNotFound;
+        case cudaErrorSymbolNotFound:
+            return hipErrorNotFound;
+        case cudaErrorArrayIsMapped:
+            return hipErrorArrayIsMapped;
+        case cudaErrorNotMappedAsPointer:
+            return hipErrorNotMappedAsPointer;
+        case cudaErrorNotMappedAsArray:
+            return hipErrorNotMappedAsArray;
+        case cudaErrorNotMapped:
+            return hipErrorNotMapped;
+        case cudaErrorAlreadyAcquired:
+            return hipErrorAlreadyAcquired;
+        case cudaErrorAlreadyMapped:
+            return hipErrorAlreadyMapped;
+#endif
+#if CUDA_VERSION >= 10020
+        case cudaErrorDeviceUninitialized:
+            return hipErrorInvalidContext;
+#endif
+        case cudaErrorUnknown:
+        default:
+            return hipErrorUnknown;  // Catch-all: the original CUDA code is not preserved.
+    }
+}
+
+inline static hipError_t hipCUResultTohipError(CUresult cuError) {  // Map a CUresult (cu* call status) onto the matching hipError_t.
+    switch (cuError) {
+        case CUDA_SUCCESS:
+            return hipSuccess;
+        case CUDA_ERROR_OUT_OF_MEMORY:
+            return hipErrorOutOfMemory;
+        case CUDA_ERROR_INVALID_VALUE:
+            return hipErrorInvalidValue;
+        case CUDA_ERROR_INVALID_DEVICE:
+            return hipErrorInvalidDevice;
+        case CUDA_ERROR_DEINITIALIZED:
+            return hipErrorDeinitialized;
+        case CUDA_ERROR_NO_DEVICE:
+            return hipErrorNoDevice;
+        case CUDA_ERROR_INVALID_CONTEXT:
+            return hipErrorInvalidContext;
+        case CUDA_ERROR_NOT_INITIALIZED:
+            return hipErrorNotInitialized;
+        case CUDA_ERROR_INVALID_HANDLE:
+            return hipErrorInvalidHandle;
+        case CUDA_ERROR_MAP_FAILED:
+            return hipErrorMapFailed;
+        case CUDA_ERROR_PROFILER_DISABLED:
+            return hipErrorProfilerDisabled;
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+            return hipErrorProfilerNotInitialized;
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+            return hipErrorProfilerAlreadyStarted;
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+            return hipErrorProfilerAlreadyStopped;
+        case CUDA_ERROR_INVALID_IMAGE:
+            return hipErrorInvalidImage;
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+            return hipErrorContextAlreadyCurrent;
+        case CUDA_ERROR_UNMAP_FAILED:
+            return hipErrorUnmapFailed;
+        case CUDA_ERROR_ARRAY_IS_MAPPED:
+            return hipErrorArrayIsMapped;
+        case CUDA_ERROR_ALREADY_MAPPED:
+            return hipErrorAlreadyMapped;
+        case CUDA_ERROR_NO_BINARY_FOR_GPU:
+            return hipErrorNoBinaryForGpu;
+        case CUDA_ERROR_ALREADY_ACQUIRED:
+            return hipErrorAlreadyAcquired;
+        case CUDA_ERROR_NOT_MAPPED:
+            return hipErrorNotMapped;
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+            return hipErrorNotMappedAsArray;
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+            return hipErrorNotMappedAsPointer;
+        case CUDA_ERROR_ECC_UNCORRECTABLE:
+            return hipErrorECCNotCorrectable;
+        case CUDA_ERROR_UNSUPPORTED_LIMIT:
+            return hipErrorUnsupportedLimit;
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+            return hipErrorContextAlreadyInUse;
+        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+            return hipErrorPeerAccessUnsupported;
+        case CUDA_ERROR_INVALID_PTX:
+            return hipErrorInvalidKernelFile;
+        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+            return hipErrorInvalidGraphicsContext;
+        case CUDA_ERROR_INVALID_SOURCE:
+            return hipErrorInvalidSource;
+        case CUDA_ERROR_FILE_NOT_FOUND:
+            return hipErrorFileNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+            return hipErrorSharedObjectSymbolNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+            return hipErrorSharedObjectInitFailed;
+        case CUDA_ERROR_OPERATING_SYSTEM:
+            return hipErrorOperatingSystem;
+        case CUDA_ERROR_NOT_FOUND:
+            return hipErrorNotFound;
+        case CUDA_ERROR_NOT_READY:
+            return hipErrorNotReady;
+        case CUDA_ERROR_ILLEGAL_ADDRESS:
+            return hipErrorIllegalAddress;
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+            return hipErrorLaunchOutOfResources;
+        case CUDA_ERROR_LAUNCH_TIMEOUT:
+            return hipErrorLaunchTimeOut;
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+            return hipErrorPeerAccessAlreadyEnabled;
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+            return hipErrorPeerAccessNotEnabled;
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+            return hipErrorSetOnActiveProcess;
+        case CUDA_ERROR_ASSERT:
+            return hipErrorAssert;
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+            return hipErrorHostMemoryAlreadyRegistered;
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+            return hipErrorHostMemoryNotRegistered;
+        case CUDA_ERROR_LAUNCH_FAILED:
+            return hipErrorLaunchFailure;
+        case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
+            return hipErrorCooperativeLaunchTooLarge;
+        case CUDA_ERROR_NOT_SUPPORTED:
+            return hipErrorNotSupported;
+        case CUDA_ERROR_UNKNOWN:
+        default:
+            return hipErrorUnknown;  // Note - translated error: every CUresult not listed above collapses to this code.
+    }
+}
+
+inline static cudaError_t hipErrorToCudaError(hipError_t hError) {  // Reverse mapping: hipError_t -> cudaError_t.
+    switch (hError) {
+        case hipSuccess:
+            return cudaSuccess;
+        case hipErrorOutOfMemory:
+            return cudaErrorMemoryAllocation;
+        case hipErrorProfilerDisabled:
+          return cudaErrorProfilerDisabled;
+        case hipErrorProfilerNotInitialized:
+            return cudaErrorProfilerNotInitialized;
+        case hipErrorProfilerAlreadyStarted:
+            return cudaErrorProfilerAlreadyStarted;
+        case hipErrorProfilerAlreadyStopped:
+            return cudaErrorProfilerAlreadyStopped;
+        case hipErrorInvalidConfiguration:
+            return cudaErrorInvalidConfiguration;
+        case hipErrorLaunchOutOfResources:
+            return cudaErrorLaunchOutOfResources;
+        case hipErrorInvalidValue:
+            return cudaErrorInvalidValue;
+        case hipErrorInvalidHandle:
+            return cudaErrorInvalidResourceHandle;
+        case hipErrorInvalidDevice:
+            return cudaErrorInvalidDevice;
+        case hipErrorInvalidMemcpyDirection:
+            return cudaErrorInvalidMemcpyDirection;
+        case hipErrorInvalidDevicePointer:
+            return cudaErrorInvalidDevicePointer;
+        case hipErrorNotInitialized:
+            return cudaErrorInitializationError;
+        case hipErrorNoDevice:
+            return cudaErrorNoDevice;
+        case hipErrorNotReady:
+            return cudaErrorNotReady;
+        case hipErrorPeerAccessNotEnabled:
+            return cudaErrorPeerAccessNotEnabled;
+        case hipErrorPeerAccessAlreadyEnabled:
+            return cudaErrorPeerAccessAlreadyEnabled;
+        case hipErrorHostMemoryAlreadyRegistered:
+            return cudaErrorHostMemoryAlreadyRegistered;
+        case hipErrorHostMemoryNotRegistered:
+            return cudaErrorHostMemoryNotRegistered;
+        case hipErrorDeinitialized:
+            return cudaErrorCudartUnloading;
+        case hipErrorInvalidSymbol:
+            return cudaErrorInvalidSymbol;
+        case hipErrorInsufficientDriver:
+            return cudaErrorInsufficientDriver;
+        case hipErrorMissingConfiguration:
+            return cudaErrorMissingConfiguration;
+        case hipErrorPriorLaunchFailure:
+            return cudaErrorPriorLaunchFailure;
+        case hipErrorInvalidDeviceFunction:
+            return cudaErrorInvalidDeviceFunction;
+        case hipErrorInvalidImage:
+            return cudaErrorInvalidKernelImage;
+        case hipErrorInvalidContext:
+#if CUDA_VERSION >= 10020
+            return cudaErrorDeviceUninitialized;
+#else
+            return cudaErrorUnknown;  // builds with CUDA_VERSION < 10020 fall back to the generic code
+#endif
+        case hipErrorMapFailed:
+            return cudaErrorMapBufferObjectFailed;
+        case hipErrorUnmapFailed:
+            return cudaErrorUnmapBufferObjectFailed;
+        case hipErrorArrayIsMapped:
+#if CUDA_VERSION >= 10010
+            return cudaErrorArrayIsMapped;
+#else
+            return cudaErrorUnknown;  // same fallback pattern for every CUDA_VERSION < 10010 branch below
+#endif
+        case hipErrorAlreadyMapped:
+#if CUDA_VERSION >= 10010
+            return cudaErrorAlreadyMapped;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNoBinaryForGpu:
+            return cudaErrorNoKernelImageForDevice;
+        case hipErrorAlreadyAcquired:
+#if CUDA_VERSION >= 10010
+            return cudaErrorAlreadyAcquired;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMapped:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMapped;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMappedAsArray:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMappedAsArray;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMappedAsPointer:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMappedAsPointer;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorECCNotCorrectable:
+            return cudaErrorECCUncorrectable;
+        case hipErrorUnsupportedLimit:
+            return cudaErrorUnsupportedLimit;
+        case hipErrorContextAlreadyInUse:
+            return cudaErrorDeviceAlreadyInUse;
+        case hipErrorPeerAccessUnsupported:
+            return cudaErrorPeerAccessUnsupported;
+        case hipErrorInvalidKernelFile:
+            return cudaErrorInvalidPtx;
+        case hipErrorInvalidGraphicsContext:
+            return cudaErrorInvalidGraphicsContext;
+        case hipErrorInvalidSource:
+#if CUDA_VERSION >= 10010
+            return cudaErrorInvalidSource;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorFileNotFound:
+#if CUDA_VERSION >= 10010
+            return cudaErrorFileNotFound;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorSharedObjectSymbolNotFound:
+            return cudaErrorSharedObjectSymbolNotFound;
+        case hipErrorSharedObjectInitFailed:
+            return cudaErrorSharedObjectInitFailed;
+        case hipErrorOperatingSystem:
+            return cudaErrorOperatingSystem;
+        case hipErrorNotFound:
+#if CUDA_VERSION >= 10010
+            return cudaErrorSymbolNotFound;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorIllegalAddress:
+            return cudaErrorIllegalAddress;
+        case hipErrorLaunchTimeOut:
+            return cudaErrorLaunchTimeout;
+        case hipErrorSetOnActiveProcess:
+            return cudaErrorSetOnActiveProcess;
+        case hipErrorLaunchFailure:
+            return cudaErrorLaunchFailure;
+        case hipErrorCooperativeLaunchTooLarge:
+            return cudaErrorCooperativeLaunchTooLarge;
+        case hipErrorNotSupported:
+            return cudaErrorNotSupported;
+        // HSA: does not exist in CUDA
+        case hipErrorRuntimeMemory:
+        // HSA: does not exist in CUDA
+        case hipErrorRuntimeOther:
+        case hipErrorUnknown:
+        case hipErrorTbd:
+        default:
+            return cudaErrorUnknown;  // Note - translated error: HIP codes with no case above all become cudaErrorUnknown.
+    }
+}
+
+/**
+ * Translate a HIP copy direction into the matching CUDA runtime enumerator.
+ * The four explicit host/device directions map one-to-one; every other
+ * value falls through to cudaMemcpyDefault.
+ */
+inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) {
+    switch (kind) {
+        case hipMemcpyHostToHost:     return cudaMemcpyHostToHost;
+        case hipMemcpyHostToDevice:   return cudaMemcpyHostToDevice;
+        case hipMemcpyDeviceToHost:   return cudaMemcpyDeviceToHost;
+        case hipMemcpyDeviceToDevice: return cudaMemcpyDeviceToDevice;
+        default:                      return cudaMemcpyDefault;
+    }
+}
+
+/**
+ * Translate a HIP texture addressing mode into its CUDA counterpart.
+ * Wrap, clamp, mirror and border map one-to-one; any unrecognised value is
+ * treated as wrap.
+ */
+inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddressMode(
+    hipTextureAddressMode kind) {
+    switch (kind) {
+        case hipAddressModeClamp:  return cudaAddressModeClamp;
+        case hipAddressModeMirror: return cudaAddressModeMirror;
+        case hipAddressModeBorder: return cudaAddressModeBorder;
+        case hipAddressModeWrap:
+        default:                   return cudaAddressModeWrap;
+    }
+}
+
+/**
+ * Translate a HIP texture filter mode into its CUDA counterpart.
+ * Only linear filtering is distinguished; point filtering is the fallback.
+ */
+inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode(
+    hipTextureFilterMode kind) {
+    if (kind == hipFilterModeLinear) {
+        return cudaFilterModeLinear;
+    }
+    return cudaFilterModePoint;
+}
+
+/**
+ * Translate a HIP texture read mode into its CUDA counterpart; anything other
+ * than normalized-float reads maps to cudaReadModeElementType.
+ */
+inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(hipTextureReadMode kind) {
+    if (kind == hipReadModeNormalizedFloat) {
+        return cudaReadModeNormalizedFloat;
+    }
+    return cudaReadModeElementType;
+}
+
+/**
+ * Translate a HIP channel format kind into its CUDA counterpart.
+ * Signed, unsigned and float map one-to-one; "none" and any unrecognised
+ * value both yield cudaChannelFormatKindNone.
+ */
+inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormatKind(
+    hipChannelFormatKind kind) {
+    switch (kind) {
+        case hipChannelFormatKindSigned:   return cudaChannelFormatKindSigned;
+        case hipChannelFormatKindUnsigned: return cudaChannelFormatKindUnsigned;
+        case hipChannelFormatKindFloat:    return cudaChannelFormatKindFloat;
+        case hipChannelFormatKindNone:
+        default:                           return cudaChannelFormatKindNone;
+    }
+}
+
+/**
+ * Stream callback: calling-convention macro (HIPRT_CB) and function-pointer type.
+ */
+#define HIPRT_CB CUDART_CB
+typedef void(HIPRT_CB* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+inline static hipError_t hipInit(unsigned int flags) {  // Calls cuInit(flags); the CUresult is translated to hipError_t.
+    return hipCUResultTohipError(cuInit(flags));
+}
+
+inline static hipError_t hipDeviceReset(void) { return hipCUDAErrorTohipError(cudaDeviceReset()); }  // cudaDeviceReset wrapper; (void) gives C callers a real prototype.
+
+inline static hipError_t hipGetLastError(void) { return hipCUDAErrorTohipError(cudaGetLastError()); }  // cudaGetLastError wrapper; (void) gives C callers a real prototype.
+
+inline static hipError_t hipPeekAtLastError(void) {  // cudaPeekAtLastError wrapper; (void) gives C callers a real prototype.
+    return hipCUDAErrorTohipError(cudaPeekAtLastError());
+}
+
+inline static hipError_t hipMalloc(void** ptr, size_t size) {  // Forwards to cudaMalloc; status translated to hipError_t.
+    return hipCUDAErrorTohipError(cudaMalloc(ptr, size));
+}
+
+inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {  // Forwards to cudaMallocPitch.
+    return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height));
+}
+
+inline static hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr,size_t* pitch,size_t widthInBytes,size_t height,unsigned int elementSizeBytes){  // Forwards to cuMemAllocPitch; CUresult translated.
+    return hipCUResultTohipError(cuMemAllocPitch(dptr,pitch,widthInBytes,height,elementSizeBytes));
+}
+
+inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) {  // Forwards to cudaMalloc3D; arguments are passed through unchanged.
+    return hipCUDAErrorTohipError(cudaMalloc3D(pitchedDevPtr, extent));
+}
+
+inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); }  // Forwards to cudaFree.
+
+inline static hipError_t hipMallocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // Separate declaration carries the deprecation attribute.
+inline static hipError_t hipMallocHost(void** ptr, size_t size) {  // Forwards to cudaMallocHost.
+    return hipCUDAErrorTohipError(cudaMallocHost(ptr, size));
+}
+
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // Separate declaration carries the deprecation attribute.
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {  // Forwards to cuMemAllocHost; CUresult translated.
+    return hipCUResultTohipError(cuMemAllocHost(ptr, size));
+}
+
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // Separate declaration carries the deprecation attribute.
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {  // Forwards to cudaHostAlloc; flags passed through as-is.
+    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
+}
+
+inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) {  // Forwards to cudaHostAlloc; flags are not remapped.
+    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
+}
+
+inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) {  // Forwards to cudaMallocManaged.
+    return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags));
+}
+
+inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
+                                        size_t width, size_t height,
+                                        unsigned int flags __dparm(hipArrayDefault)) {  // Forwards to cudaMallocArray.
+    return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags));
+}
+
+inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc,
+                             hipExtent extent, unsigned int flags) {  // Forwards to cudaMalloc3DArray.
+    return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags));
+}
+
+inline static hipError_t hipFreeArray(hipArray* array) {  // Forwards to cudaFreeArray.
+    return hipCUDAErrorTohipError(cudaFreeArray(array));
+}
+
+inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {  // Forwards to cudaHostGetDevicePointer.
+    return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));
+}
+
+inline static hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {  // Forwards to cudaHostGetFlags.
+    return hipCUDAErrorTohipError(cudaHostGetFlags(flagsPtr, hostPtr));
+}
+
+inline static hipError_t hipHostRegister(void* ptr, size_t size, unsigned int flags) {  // Forwards to cudaHostRegister; flags passed through as-is.
+    return hipCUDAErrorTohipError(cudaHostRegister(ptr, size, flags));
+}
+
+inline static hipError_t hipHostUnregister(void* ptr) {  // Forwards to cudaHostUnregister.
+    return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
+}
+
+inline static hipError_t hipFreeHost(void* ptr)
+    __attribute__((deprecated("use hipHostFree instead")));  // Separate declaration carries the deprecation attribute.
+inline static hipError_t hipFreeHost(void* ptr) {  // Forwards to cudaFreeHost.
+    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipHostFree(void* ptr) {  // Forwards to cudaFreeHost (the non-deprecated spelling of hipFreeHost).
+    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipSetDevice(int device) {  // Forwards to cudaSetDevice.
+    return hipCUDAErrorTohipError(cudaSetDevice(device));
+}
+
+inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) {  // Copy the criteria into a cudaDeviceProp and defer to cudaChooseDevice.
+    struct cudaDeviceProp cdprop;
+    if (prop == NULL) return hipErrorInvalidValue;  // reject null criteria instead of dereferencing them
+    memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp));  // fields not copied below stay zero
+    cdprop.major = prop->major; cdprop.minor = prop->minor;
+    cdprop.totalGlobalMem = prop->totalGlobalMem;
+    cdprop.sharedMemPerBlock = prop->sharedMemPerBlock;
+    cdprop.regsPerBlock = prop->regsPerBlock;
+    cdprop.warpSize = prop->warpSize;
+    cdprop.maxThreadsPerBlock = prop->maxThreadsPerBlock;
+    cdprop.clockRate = prop->clockRate;
+    cdprop.totalConstMem = prop->totalConstMem;
+    cdprop.multiProcessorCount = prop->multiProcessorCount;
+    cdprop.l2CacheSize = prop->l2CacheSize;
+    cdprop.maxThreadsPerMultiProcessor = prop->maxThreadsPerMultiProcessor;
+    cdprop.computeMode = prop->computeMode;
+    cdprop.canMapHostMemory = prop->canMapHostMemory;
+    cdprop.memoryClockRate = prop->memoryClockRate;
+    cdprop.memoryBusWidth = prop->memoryBusWidth;
+    return hipCUDAErrorTohipError(cudaChooseDevice(device, &cdprop));  // `device` is validated by the CUDA call, not here
+}
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) {  // Forwards to cuMemcpyHtoD; CUresult translated.
+    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) {  // Forwards to cuMemcpyDtoH; CUresult translated.
+    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size) {  // Forwards to cuMemcpyDtoD; CUresult translated.
+    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size,
+                                            hipStream_t stream) {  // Forwards to cuMemcpyHtoDAsync on `stream`.
+    return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size,
+                                            hipStream_t stream) {  // Forwards to cuMemcpyDtoHAsync on `stream`.
+    return hipCUResultTohipError(cuMemcpyDtoHAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size,
+                                            hipStream_t stream) {  // Forwards to cuMemcpyDtoDAsync on `stream`.
+    return hipCUResultTohipError(cuMemcpyDtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
+                                   hipMemcpyKind copyKind) {  // cudaMemcpy with copyKind mapped by hipMemcpyKindToCudaMemcpyKind.
+    return hipCUDAErrorTohipError(
+        cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
+}
+
+
+/* Enqueue the copy on `stream`, then wait on it; an enqueue failure is returned without waiting. */
+inline static hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+                                             hipMemcpyKind copyKind, hipStream_t stream) {
+    cudaError_t status =
+        cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream);
+    if (status == cudaSuccess) {
+        // Enqueue succeeded: the result now comes from cudaStreamSynchronize.
+        status = cudaStreamSynchronize(stream);
+    }
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
+                                        hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) {  // cudaMemcpyAsync with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(
+        cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream));
+}
+
+inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
+                                           size_t offset __dparm(0),
+                                           hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) {  // cudaMemcpyToSymbol with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset,
+                                                     hipMemcpyKindToCudaMemcpyKind(copyType)));
+}
+
+inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
+                                                size_t sizeBytes, size_t offset,
+                                                hipMemcpyKind copyType,
+                                                hipStream_t stream __dparm(0)) {  // cudaMemcpyToSymbolAsync with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(
+        symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
+}
+
+inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
+                                             size_t offset __dparm(0),
+                                             hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {  // cudaMemcpyFromSymbol with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset,
+                                                       hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
+                                                  size_t sizeBytes, size_t offset,
+                                                  hipMemcpyKind kind,
+                                                  hipStream_t stream __dparm(0)) {  // cudaMemcpyFromSymbolAsync with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(
+        dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
+}
+
+inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {  // Forwards to cudaGetSymbolAddress.
+    return hipCUDAErrorTohipError(cudaGetSymbolAddress(devPtr, symbolName));
+}
+
+inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {  // Forwards to cudaGetSymbolSize.
+    return hipCUDAErrorTohipError(cudaGetSymbolSize(size, symbolName));
+}
+
+inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                     size_t width, size_t height, hipMemcpyKind kind) {  // cudaMemcpy2D with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(
+        cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {  // Hands the descriptor straight to cuMemcpy2D.
+  return hipCUResultTohipError(cuMemcpy2D(pCopy));
+}
+
+inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {  // Hands the descriptor straight to cuMemcpy2DAsync.
+  return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));
+}
+
+inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
+{  // Passes `p` to cudaMemcpy3D unchanged.
+    return hipCUDAErrorTohipError(cudaMemcpy3D(p));
+}
+
+inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream)
+{  // Passes `p` and `stream` to cudaMemcpy3DAsync unchanged.
+    return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));
+}
+
+inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                          size_t width, size_t height, hipMemcpyKind kind,
+                                          hipStream_t stream) {  // cudaMemcpy2DAsync with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height,
+                                                    hipMemcpyKindToCudaMemcpyKind(kind), stream));
+}
+
+inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
+                                            const void* src, size_t spitch, size_t width,
+                                            size_t height, hipMemcpyKind kind) {  // cudaMemcpy2DToArray with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width,
+                                                      height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
+                                                           size_t hOffset, const void* src,
+                                                           size_t count, hipMemcpyKind kind) {  // cudaMemcpyToArray with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(
+        cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
+                                                             size_t wOffset, size_t hOffset,
+                                                             size_t count, hipMemcpyKind kind) {  // cudaMemcpyFromArray with the copy kind mapped to CUDA.
+    return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count,
+                                                      hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
+                                       size_t count) {  // The hipArray* is cast to CUarray for cuMemcpyAtoH.
+    return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
+}
+
+inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost,
+                                       size_t count) {  // The hipArray* is cast to CUarray for cuMemcpyHtoA.
+    return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
+}
+
+inline static hipError_t hipDeviceSynchronize(void) {  // cudaDeviceSynchronize wrapper; (void) gives C callers a real prototype.
+    return hipCUDAErrorTohipError(cudaDeviceSynchronize());
+}
+
+inline static hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* pCacheConfig) {  // Passes the HIP-typed pointer directly to cudaDeviceGetCacheConfig.
+    return hipCUDAErrorTohipError(cudaDeviceGetCacheConfig(pCacheConfig));
+}
+
+inline static hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) {  // `attr` goes to cudaFuncSetAttribute without conversion.
+    return hipCUDAErrorTohipError(cudaFuncSetAttribute(func, attr, value));
+}
+
+inline static hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) {  // `cacheConfig` goes to cudaDeviceSetCacheConfig without conversion.
+    return hipCUDAErrorTohipError(cudaDeviceSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {  // `config` goes to cudaFuncSetSharedMemConfig without conversion.
+    return hipCUDAErrorTohipError(cudaFuncSetSharedMemConfig(func, config));
+}
+
+inline static const char* hipGetErrorString(hipError_t error) {  // Converts to cudaError_t first; the text comes from cudaGetErrorString.
+    return cudaGetErrorString(hipErrorToCudaError(error));
+}
+
+inline static const char* hipGetErrorName(hipError_t error) {  // Converts to cudaError_t first; the name comes from cudaGetErrorName.
+    return cudaGetErrorName(hipErrorToCudaError(error));
+}
+
+inline static hipError_t hipGetDeviceCount(int* count) {  // Forwards to cudaGetDeviceCount.
+    return hipCUDAErrorTohipError(cudaGetDeviceCount(count));
+}
+
+inline static hipError_t hipGetDevice(int* device) {  // Forwards to cudaGetDevice.
+    return hipCUDAErrorTohipError(cudaGetDevice(device));
+}
+
+inline static hipError_t hipIpcCloseMemHandle(void* devPtr) {  // Forwards to cudaIpcCloseMemHandle.
+    return hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr));
+}
+
+inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {  // Forwards to cudaIpcGetEventHandle.
+    return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event));
+}
+
+inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) {  // Forwards to cudaIpcGetMemHandle.
+    return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr));
+}
+
+inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {  // Forwards to cudaIpcOpenEventHandle.
+    return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle));
+}
+
+inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle,
+                                             unsigned int flags) {  // Forwards to cudaIpcOpenMemHandle; flags passed through as-is.
+    return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags));
+}
+
+inline static hipError_t hipMemset(void* devPtr, int value, size_t count) {  // Forwards to cudaMemset.
+    return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) {  // Forwards to cuMemsetD32; CUresult translated.
+    return hipCUResultTohipError(cuMemsetD32(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count,
+                                        hipStream_t stream __dparm(0)) {  // Forwards to cudaMemsetAsync on `stream`.
+    return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count,
+                                           hipStream_t stream __dparm(0)) {  // Forwards to cuMemsetD32Async on `stream`.
+    return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) {  // Forwards to cuMemsetD8; CUresult translated.
+    return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes));
+}
+
+inline static hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes,
+                                          hipStream_t stream __dparm(0)) {  // Forwards to cuMemsetD8Async on `stream`.
+    return hipCUResultTohipError(cuMemsetD8Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes) {  // Passes `sizeBytes` to cuMemsetD16 unchanged.
+    return hipCUResultTohipError(cuMemsetD16(dest, value, sizeBytes));
+}
+
+inline static hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes,
+                                           hipStream_t stream __dparm(0)) {  // Passes `sizeBytes` to cuMemsetD16Async unchanged.
+    return hipCUResultTohipError(cuMemsetD16Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) {  // Forwards to cudaMemset2D.
+    return hipCUDAErrorTohipError(cudaMemset2D(dst, pitch, value, width, height));
+}
+
+inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream));
+}
+
+inline static hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent ){
+    return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent));
+}
+
+inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent, hipStream_t stream __dparm(0) ){
+    return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream));
+}
+
+inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {  // Queries CUDA for `device` and copies the result field by field into *p_prop; returns the translated CUDA status.
+    struct cudaDeviceProp cdprop;
+    cudaError_t cerror;
+    cerror = cudaGetDeviceProperties(&cdprop, device);
+    if (cerror != cudaSuccess) { return hipCUDAErrorTohipError(cerror); }  // cdprop is never initialized locally, so nothing may be copied out of it after a failed query; *p_prop is left untouched
+    strncpy(p_prop->name, cdprop.name, 256);
+    p_prop->totalGlobalMem = cdprop.totalGlobalMem;
+    p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock;
+    p_prop->regsPerBlock = cdprop.regsPerBlock;
+    p_prop->warpSize = cdprop.warpSize;
+    p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock;
+    for (int i = 0; i < 3; i++) {
+        p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i];
+        p_prop->maxGridSize[i] = cdprop.maxGridSize[i];
+    }
+    p_prop->clockRate = cdprop.clockRate;
+    p_prop->memoryClockRate = cdprop.memoryClockRate;
+    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
+    p_prop->totalConstMem = cdprop.totalConstMem;
+    p_prop->major = cdprop.major;
+    p_prop->minor = cdprop.minor;
+    p_prop->multiProcessorCount = cdprop.multiProcessorCount;
+    p_prop->l2CacheSize = cdprop.l2CacheSize;
+    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
+    p_prop->computeMode = cdprop.computeMode;
+    p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate:
+
+    int ccVers = p_prop->major * 100 + p_prop->minor * 10;  // compute capability encoded as major*100 + minor*10, e.g. 3.5 -> 350
+    p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110);
+    p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110);
+    p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120);
+    p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120);
+    p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200);
+    p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120);
+    p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110);  // NOTE(review): threshold is lower than the global 64-bit one above; CUDA appears to document 64-bit shared-memory atomics from compute capability 2.0 - confirm
+    p_prop->arch.hasDoubles = (ccVers >= 130);
+    p_prop->arch.hasWarpVote = (ccVers >= 120);
+    p_prop->arch.hasWarpBallot = (ccVers >= 200);
+    p_prop->arch.hasWarpShuffle = (ccVers >= 300);
+    p_prop->arch.hasFunnelShift = (ccVers >= 350);
+    p_prop->arch.hasThreadFenceSystem = (ccVers >= 200);
+    p_prop->arch.hasSyncThreadsExt = (ccVers >= 200);
+    p_prop->arch.hasSurfaceFuncs = (ccVers >= 200);
+    p_prop->arch.has3dGrid = (ccVers >= 200);
+    p_prop->arch.hasDynamicParallelism = (ccVers >= 350);
+
+    p_prop->concurrentKernels = cdprop.concurrentKernels;
+    p_prop->pciDomainID = cdprop.pciDomainID;
+    p_prop->pciBusID = cdprop.pciBusID;
+    p_prop->pciDeviceID = cdprop.pciDeviceID;
+    p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
+    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
+    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
+    p_prop->gcnArch = 0; // Not a GCN arch
+    p_prop->integrated = cdprop.integrated;
+    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
+    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
+    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;  // the four "Unmatched" fields are hard-coded to 0 rather than read from cdprop
+    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
+
+    p_prop->maxTexture1D    = cdprop.maxTexture1D;
+    p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
+    p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
+    p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
+    p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
+    p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
+
+    p_prop->memPitch                 = cdprop.memPitch;
+    p_prop->textureAlignment         = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
+    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
+    p_prop->ECCEnabled               = cdprop.ECCEnabled;
+    p_prop->tccDriver                = cdprop.tccDriver;
+
+    return hipCUDAErrorTohipError(cerror);
+}
+
+inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {  // Maps a HIP device attribute onto its CUDA counterpart and queries it for `device`.
+    enum cudaDeviceAttr cdattr;  // assigned by every non-default case of the switch below
+    cudaError_t cerror;
+
+    switch (attr) {
+        case hipDeviceAttributeMaxThreadsPerBlock:
+            cdattr = cudaDevAttrMaxThreadsPerBlock;
+            break;
+        case hipDeviceAttributeMaxBlockDimX:
+            cdattr = cudaDevAttrMaxBlockDimX;
+            break;
+        case hipDeviceAttributeMaxBlockDimY:
+            cdattr = cudaDevAttrMaxBlockDimY;
+            break;
+        case hipDeviceAttributeMaxBlockDimZ:
+            cdattr = cudaDevAttrMaxBlockDimZ;
+            break;
+        case hipDeviceAttributeMaxGridDimX:
+            cdattr = cudaDevAttrMaxGridDimX;
+            break;
+        case hipDeviceAttributeMaxGridDimY:
+            cdattr = cudaDevAttrMaxGridDimY;
+            break;
+        case hipDeviceAttributeMaxGridDimZ:
+            cdattr = cudaDevAttrMaxGridDimZ;
+            break;
+        case hipDeviceAttributeMaxSharedMemoryPerBlock:
+            cdattr = cudaDevAttrMaxSharedMemoryPerBlock;
+            break;
+        case hipDeviceAttributeTotalConstantMemory:
+            cdattr = cudaDevAttrTotalConstantMemory;
+            break;
+        case hipDeviceAttributeWarpSize:
+            cdattr = cudaDevAttrWarpSize;
+            break;
+        case hipDeviceAttributeMaxRegistersPerBlock:
+            cdattr = cudaDevAttrMaxRegistersPerBlock;
+            break;
+        case hipDeviceAttributeClockRate:
+            cdattr = cudaDevAttrClockRate;
+            break;
+        case hipDeviceAttributeMemoryClockRate:
+            cdattr = cudaDevAttrMemoryClockRate;
+            break;
+        case hipDeviceAttributeMemoryBusWidth:
+            cdattr = cudaDevAttrGlobalMemoryBusWidth;
+            break;
+        case hipDeviceAttributeMultiprocessorCount:
+            cdattr = cudaDevAttrMultiProcessorCount;
+            break;
+        case hipDeviceAttributeComputeMode:
+            cdattr = cudaDevAttrComputeMode;
+            break;
+        case hipDeviceAttributeL2CacheSize:
+            cdattr = cudaDevAttrL2CacheSize;
+            break;
+        case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+            cdattr = cudaDevAttrMaxThreadsPerMultiProcessor;
+            break;
+        case hipDeviceAttributeComputeCapabilityMajor:
+            cdattr = cudaDevAttrComputeCapabilityMajor;
+            break;
+        case hipDeviceAttributeComputeCapabilityMinor:
+            cdattr = cudaDevAttrComputeCapabilityMinor;
+            break;
+        case hipDeviceAttributeConcurrentKernels:
+            cdattr = cudaDevAttrConcurrentKernels;
+            break;
+        case hipDeviceAttributePciBusId:
+            cdattr = cudaDevAttrPciBusId;
+            break;
+        case hipDeviceAttributePciDeviceId:
+            cdattr = cudaDevAttrPciDeviceId;
+            break;
+        case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
+            cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor;
+            break;
+        case hipDeviceAttributeIsMultiGpuBoard:
+            cdattr = cudaDevAttrIsMultiGpuBoard;
+            break;
+        case hipDeviceAttributeIntegrated:
+            cdattr = cudaDevAttrIntegrated;
+            break;
+        case hipDeviceAttributeMaxTexture1DWidth:
+            cdattr = cudaDevAttrMaxTexture1DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture2DWidth:
+            cdattr = cudaDevAttrMaxTexture2DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture2DHeight:
+            cdattr = cudaDevAttrMaxTexture2DHeight;
+            break;
+        case hipDeviceAttributeMaxTexture3DWidth:
+            cdattr = cudaDevAttrMaxTexture3DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture3DHeight:
+            cdattr = cudaDevAttrMaxTexture3DHeight;
+            break;
+        case hipDeviceAttributeMaxTexture3DDepth:
+            cdattr = cudaDevAttrMaxTexture3DDepth;
+            break;
+        case hipDeviceAttributeMaxPitch:
+            cdattr = cudaDevAttrMaxPitch;
+            break;
+        case hipDeviceAttributeTextureAlignment:
+            cdattr = cudaDevAttrTextureAlignment;
+            break;
+        case hipDeviceAttributeTexturePitchAlignment:
+            cdattr = cudaDevAttrTexturePitchAlignment;
+            break;
+        case hipDeviceAttributeKernelExecTimeout:
+            cdattr = cudaDevAttrKernelExecTimeout;
+            break;
+        case hipDeviceAttributeCanMapHostMemory:
+            cdattr = cudaDevAttrCanMapHostMemory;
+            break;
+        case hipDeviceAttributeEccEnabled:
+            cdattr = cudaDevAttrEccEnabled;
+            break;
+        case hipDeviceAttributeCooperativeLaunch:
+            cdattr = cudaDevAttrCooperativeLaunch;
+            break;
+        case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+            cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
+            break;
+        default:
+            return hipCUDAErrorTohipError(cudaErrorInvalidValue);  // attribute has no mapping in this switch; cudaDeviceGetAttribute is not called and *pi is not written here
+    }
+
+    cerror = cudaDeviceGetAttribute(pi, cdattr, device);  // pi and device are forwarded unchanged
+
+    return hipCUDAErrorTohipError(cerror);
+}
+
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                                      const void* func,
+                                                                      int blockSize,
+                                                                      size_t dynamicSMemSize) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
+                                                              blockSize, dynamicSMemSize));
+}
+
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+                                                                      const void* func,
+                                                                      int blockSize,
+                                                                      size_t dynamicSMemSize,
+                                                                      unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
+                                                      blockSize, dynamicSMemSize, flags));
+}
+
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, 
+                                                                 hipFunction_t f,
+                                                                 int  blockSize,
+                                                                 size_t dynamicSMemSize ){
+    return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f,
+                                                                   blockSize, dynamicSMemSize));
+}
+
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
+                                                                          hipFunction_t f,
+                                                                          int  blockSize,
+                                                                          size_t dynamicSMemSize,
+                                                                          unsigned int  flags ) {
+    return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks,f,
+                                                                blockSize, dynamicSMemSize, flags));
+}
+
+//TODO - Match CUoccupancyB2DSize
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
+                                             hipFunction_t f, size_t dynSharedMemPerBlk,
+                                             int blockSizeLimit){
+    return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL,
+                                 dynSharedMemPerBlk, blockSizeLimit));
+}
+
+//TODO - Match CUoccupancyB2DSize
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
+                                             hipFunction_t f, size_t dynSharedMemPerBlk,
+                                             int blockSizeLimit, unsigned int  flags){
+    return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL,
+                                 dynSharedMemPerBlk, blockSizeLimit, flags));
+}
+
+inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) {  // Fills *attributes from cudaPointerGetAttributes(ptr); *attributes is written only when that call succeeds.
+    struct cudaPointerAttributes cPA;
+    hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr));
+    if (err == hipSuccess) {
+#if (CUDART_VERSION >= 11000)
+        auto memType = cPA.type;
+#else
+        unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesn't force c++11
+#endif
+        switch (memType) {
+            case cudaMemoryTypeDevice:
+                attributes->memoryType = hipMemoryTypeDevice;
+                break;
+            case cudaMemoryTypeHost:
+                attributes->memoryType = hipMemoryTypeHost;
+                break;
+            default:
+                return hipErrorUnknown;  // any memory type other than device/host is rejected even though the CUDA query succeeded; no field of *attributes has been written at this point
+        }
+        attributes->device = cPA.device;
+        attributes->devicePointer = cPA.devicePointer;
+        attributes->hostPointer = cPA.hostPointer;
+        attributes->isManaged = 0;  // hard-coded; not derived from cPA
+        attributes->allocationFlags = 0;  // hard-coded; not derived from cPA
+    }
+    return err;
+}
+
+inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {
+    return hipCUDAErrorTohipError(cudaMemGetInfo(free, total));
+}
+
+inline static hipError_t hipEventCreate(hipEvent_t* event) {
+    return hipCUDAErrorTohipError(cudaEventCreate(event));
+}
+
+inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __dparm(NULL)) {
+    return hipCUDAErrorTohipError(cudaEventRecord(event, stream));
+}
+
+inline static hipError_t hipEventSynchronize(hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaEventSynchronize(event));
+}
+
+inline static hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
+    return hipCUDAErrorTohipError(cudaEventElapsedTime(ms, start, stop));
+}
+
+inline static hipError_t hipEventDestroy(hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaEventDestroy(event));
+}
+
+inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
+    return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority));
+}
+
+inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
+    return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority));
+}
+
+inline static hipError_t hipStreamCreate(hipStream_t* stream) {
+    return hipCUDAErrorTohipError(cudaStreamCreate(stream));
+}
+
+inline static hipError_t hipStreamSynchronize(hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
+}
+
+inline static hipError_t hipStreamDestroy(hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaStreamDestroy(stream));
+}
+
+inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) {
+    return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) {
+    return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority));
+}
+
+inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event,
+                                            unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaStreamWaitEvent(stream, event, flags));
+}
+
+inline static hipError_t hipStreamQuery(hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaStreamQuery(stream));
+}
+
+inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback,
+                                              void* userData, unsigned int flags) {
+    return hipCUDAErrorTohipError(
+        cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
+}
+
+inline static hipError_t hipDriverGetVersion(int* driverVersion) {  // Calls cudaDriverGetVersion, then overwrites the reported version with a fixed value; returns the translated CUDA status.
+    cudaError_t err = cudaDriverGetVersion(driverVersion);
+
+    // Override driver version to match version reported on HCC side.
+    if (driverVersion != NULL) { *driverVersion = 4; }  // a null pointer is left to the error already returned by the CUDA call instead of being dereferenced
+
+    return hipCUDAErrorTohipError(err);
+}
+
+inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) {
+    return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion));
+}
+
+inline static hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) {
+    return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
+}
+
+inline static hipError_t hipDeviceDisablePeerAccess(int peerDevice) {
+    return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice));
+}
+
+inline static hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags));
+}
+
+inline static hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) {
+    return hipCUResultTohipError(cuCtxDisablePeerAccess(peerCtx));
+}
+
+inline static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) {
+    return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags));
+}
+
+inline static hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags,
+                                                     int* active) {
+    return hipCUResultTohipError(cuDevicePrimaryCtxGetState(dev, flags, active));
+}
+
+inline static hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {
+    return hipCUResultTohipError(cuDevicePrimaryCtxRelease(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {
+    return hipCUResultTohipError(cuDevicePrimaryCtxRetain(pctx, dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {
+    return hipCUResultTohipError(cuDevicePrimaryCtxReset(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {
+    return hipCUResultTohipError(cuDevicePrimaryCtxSetFlags(dev, flags));
+}
+
+inline static hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize,
+                                               hipDeviceptr_t dptr) {
+    return hipCUResultTohipError(cuMemGetAddressRange(pbase, psize, dptr));
+}
+
+inline static hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice,
+                                       size_t count) {
+    return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count));
+}
+
+inline static hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src,
+                                            int srcDevice, size_t count,
+                                            hipStream_t stream __dparm(0)) {
+    return hipCUDAErrorTohipError(
+        cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream));
+}
+
+// Profile APIs:
+inline static hipError_t hipProfilerStart(void) { return hipCUDAErrorTohipError(cudaProfilerStart()); }  // Forwards to cudaProfilerStart and translates its status.
+
+inline static hipError_t hipProfilerStop(void) { return hipCUDAErrorTohipError(cudaProfilerStop()); }  // Forwards to cudaProfilerStop and translates its status.
+
+inline static hipError_t hipGetDeviceFlags(unsigned int* flags) {
+    return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags));
+}
+
+inline static hipError_t hipSetDeviceFlags(unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags));
+}
+
+inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaEventCreateWithFlags(event, flags));
+}
+
+inline static hipError_t hipEventQuery(hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaEventQuery(event));
+}
+
+inline static hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) {
+    return hipCUResultTohipError(cuCtxCreate(ctx, flags, device));
+}
+
+inline static hipError_t hipCtxDestroy(hipCtx_t ctx) {
+    return hipCUResultTohipError(cuCtxDestroy(ctx));
+}
+
+inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {
+    return hipCUResultTohipError(cuCtxPopCurrent(ctx));
+}
+
+inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) {
+    return hipCUResultTohipError(cuCtxPushCurrent(ctx));
+}
+
+inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) {
+    return hipCUResultTohipError(cuCtxSetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {
+    return hipCUResultTohipError(cuCtxGetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetDevice(hipDevice_t* device) {
+    return hipCUResultTohipError(cuCtxGetDevice(device));
+}
+
+inline static hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {
+    return hipCUResultTohipError(cuCtxGetApiVersion(ctx, (unsigned int*)apiVersion));
+}
+
+inline static hipError_t hipCtxGetCacheConfig(hipFuncCache* cacheConfig) {
+    return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetCacheConfig(hipFuncCache cacheConfig) {
+    return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config));
+}
+
+inline static hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig*)pConfig));
+}
+
+inline static hipError_t hipCtxSynchronize(void) {
+    return hipCUResultTohipError(cuCtxSynchronize());
+}
+
+inline static hipError_t hipCtxGetFlags(unsigned int* flags) {
+    return hipCUResultTohipError(cuCtxGetFlags(flags));
+}
+
+inline static hipError_t hipCtxDetach(hipCtx_t ctx) {
+    return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) {
+    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
+inline static hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) {
+    return hipCUResultTohipError(cuDeviceComputeCapability(major, minor, device));
+}
+
+inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) {
+    return hipCUResultTohipError(cuDeviceGetName(name, len, device));
+}
+
+inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
+                                                  int srcDevice, int dstDevice) {
+    return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice));
+}
+
+inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) {
+    return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device));
+}
+
+inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) {
+    return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId));
+}
+
+inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* config) {
+    return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {
+    return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
+    return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
+}
+
+inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {
+    return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod) {
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
+                                              const char* kname) {
+    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
+}
+
+inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name){  // Forwards to cuModuleGetTexRef and translates the CUresult into a hipError_t.
+    return hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));  // the translated status must be returned; falling off the end of a non-void function leaves the caller with an indeterminate value
+}
+
+inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {
+    return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
+}
+
+inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {
+    return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
+                                            const char* name) {
+    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
+                                             unsigned int numOptions, hipJitOption* options,
+                                             void** optionValues) {
+    return hipCUResultTohipError(
+        cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
+}
+
+inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks,
+					 dim3 dimBlocks, void** args, size_t sharedMemBytes,
+					 hipStream_t stream)
+{
+   return hipCUDAErrorTohipError(cudaLaunchKernel(function_address,numBlocks,dimBlocks,args,sharedMemBytes,stream));
+}
+
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+                                               unsigned int gridDimY, unsigned int gridDimZ,
+                                               unsigned int blockDimX, unsigned int blockDimY,
+                                               unsigned int blockDimZ, unsigned int sharedMemBytes,
+                                               hipStream_t stream, void** kernelParams,
+                                               void** extra) {
+    return hipCUResultTohipError(cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX,
+                                                blockDimY, blockDimZ, sharedMemBytes, stream,
+                                                kernelParams, extra));
+}
+
+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
+    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
+                                                         struct textureReference* tex,
+                                                         const void* devPtr,
+                                                         const hipChannelFormatDesc* desc,
+                                                         size_t size __dparm(UINT_MAX)) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
+    size_t* offset, struct textureReference* tex, const void* devPtr,
+    const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
+    return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
+                                                        hipChannelFormatKind f) {
+    return cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));
+}
+
+inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
+                                                const hipResourceDesc* pResDesc,
+                                                const hipTextureDesc* pTexDesc,
+                                                const hipResourceViewDesc* pResViewDesc) {
+    return hipCUDAErrorTohipError(
+        cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc));
+}
+
+inline static hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) {
+    return hipCUDAErrorTohipError(cudaDestroyTextureObject(textureObject));
+}
+
+inline static hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
+                                                const hipResourceDesc* pResDesc) {
+    return hipCUDAErrorTohipError(cudaCreateSurfaceObject(pSurfObject, pResDesc));
+}
+
+inline static hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {
+    return hipCUDAErrorTohipError(cudaDestroySurfaceObject(surfaceObject));
+}
+
+inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
+                                           hipTextureObject_t textureObject) {
+    return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
+    size_t* offset, const struct textureReference* texref) {
+    return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
+}
+
+inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
+{
+    return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
+}
+
+inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
+                                      void** kernelParams, unsigned int sharedMemBytes,
+                                      hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+            cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
+                                                 int  numDevices, unsigned int  flags) {
+    return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __CUDACC__
+
+template<class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
+                                                                      T func,
+                                                                      int blockSize,
+                                                                      size_t dynamicSMemSize) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
+                                                            blockSize, dynamicSMemSize));
+}
+
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
+                                                           size_t dynamicSMemSize = 0,
+                                                           int blockSizeLimit = 0) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func,
+                                                           dynamicSMemSize, blockSizeLimit));
+}
+
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func,
+                                                           size_t dynamicSMemSize = 0,
+                                                           int blockSizeLimit = 0, unsigned int  flags = 0) {
+    // Must target the *WithFlags CUDA entry point: the plain variant has no 'flags' parameter.
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSizeWithFlags(
+        minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags)); }
+
+template <class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, T func,
+                                              int  blockSize, size_t dynamicSMemSize,unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
+                                                                 blockSize, dynamicSMemSize, flags));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, const hipChannelFormatDesc& desc,
+                                        size_t size = UINT_MAX) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+    const hipChannelFormatDesc& desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>* tex, hipArray_const_t array,
+    const hipChannelFormatDesc* desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
+}
+
+template <class T>
+inline static hipChannelFormatDesc hipCreateChannelDesc() {
+    return cudaCreateChannelDesc<T>();
+}
+
+template <class T>
+inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+                                             void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+            cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
+    return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
+}
+
+inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){
+    return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
+}
+
+inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){
+   return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
+}
+
+inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, hipDeviceptr_t dptr, size_t Pitch){
+   return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
+}
+
+inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
+   return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
+}
+
+inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
+   return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
+}
+
+inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
+   return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
+}
+
+inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
+   return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+}
+
+inline static hipError_t hipArrayDestroy(hiparray hArray){
+   return hipCUResultTohipError(cuArrayDestroy(hArray));
+}
+
+#endif  //__CUDACC__
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
diff --git a/hipnv/include/hip/nvidia_detail/hip_texture_types.h b/hipnv/include/hip/nvidia_detail/hip_texture_types.h
new file mode 100644
index 0000000000..df374d705a
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hip_texture_types.h
@@ -0,0 +1,6 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+
+#include <texture_types.h>
+
+#endif

From eeaf35eda368a8ae045462f70a43618a02049a5c Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 27 Jan 2021 09:00:28 -0500
Subject: [PATCH 002/177] SWDEV-269784 - managed memory support for HIP CUDA

Change-Id: I01f9fc64573f402031eceab24395e5cbd93007f9
---
 .../hip/nvidia_detail/hip_runtime_api.h       | 88 +++++++++++++++++++
 1 file changed, 88 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index eb3df19bc4..827374da5d 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -63,6 +63,22 @@ typedef enum hipMemcpyKind {
     hipMemcpyDefault
 } hipMemcpyKind;
 
+typedef enum hipMemoryAdvise {
+    hipMemAdviseSetReadMostly,
+    hipMemAdviseUnsetReadMostly,
+    hipMemAdviseSetPreferredLocation,
+    hipMemAdviseUnsetPreferredLocation,
+    hipMemAdviseSetAccessedBy,
+    hipMemAdviseUnsetAccessedBy
+} hipMemoryAdvise;
+
+typedef enum hipMemRangeAttribute {
+    hipMemRangeAttributeReadMostly,
+    hipMemRangeAttributePreferredLocation,
+    hipMemRangeAttributeAccessedBy,
+    hipMemRangeAttributeLastPrefetchLocation
+} hipMemRangeAttribute;
+
 // hipDataType
 #define hipDataType cudaDataType
 #define HIP_R_16F CUDA_R_16F
@@ -250,6 +266,7 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind;
 
 #define hipMemAttachGlobal cudaMemAttachGlobal
 #define hipMemAttachHost cudaMemAttachHost
+#define hipMemAttachSingle cudaMemAttachSingle
 
 #define hipHostRegisterDefault cudaHostRegisterDefault
 #define hipHostRegisterPortable cudaHostRegisterPortable
@@ -336,6 +353,8 @@ typedef cudaSurfaceObject_t hipSurfaceObject_t;
 #define hipTextureType3D cudaTextureType3D
 #define hipDeviceMapHost cudaDeviceMapHost
 
+#define hipCpuDeviceId cudaCpuDeviceId
+#define hipInvalidDeviceId cudaInvalidDeviceId
 typedef struct cudaExtent hipExtent;
 typedef struct cudaPitchedPtr hipPitchedPtr;
 #define make_hipExtent make_cudaExtent
@@ -798,6 +817,42 @@ inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddr
     }
 }
 
+inline static enum cudaMemRangeAttribute hipMemRangeAttributeTocudaMemRangeAttribute(
+   hipMemRangeAttribute kind) {
+   switch (kind) {
+       case hipMemRangeAttributeReadMostly:
+           return cudaMemRangeAttributeReadMostly;
+       case hipMemRangeAttributePreferredLocation:
+           return cudaMemRangeAttributePreferredLocation;
+       case hipMemRangeAttributeAccessedBy:
+           return cudaMemRangeAttributeAccessedBy;
+       case hipMemRangeAttributeLastPrefetchLocation:
+           return cudaMemRangeAttributeLastPrefetchLocation;
+       default:
+           return cudaMemRangeAttributeReadMostly;
+   }
+}
+
+inline static enum cudaMemoryAdvise hipMemoryAdviseTocudaMemoryAdvise(
+    hipMemoryAdvise kind) {  // translate a HIP advice enumerator to its CUDA counterpart
+   switch (kind) {
+       case hipMemAdviseSetReadMostly:
+           return cudaMemAdviseSetReadMostly;
+       case hipMemAdviseUnsetReadMostly :
+           return cudaMemAdviseUnsetReadMostly ;
+       case hipMemAdviseSetPreferredLocation:
+           return cudaMemAdviseSetPreferredLocation;
+       case hipMemAdviseUnsetPreferredLocation:
+           return cudaMemAdviseUnsetPreferredLocation;
+       case hipMemAdviseSetAccessedBy:
+           return cudaMemAdviseSetAccessedBy;
+       case hipMemAdviseUnsetAccessedBy:
+           return cudaMemAdviseUnsetAccessedBy;
+       default:  // unrecognized values are not rejected; they fall back to SetReadMostly
+           return cudaMemAdviseSetReadMostly;
+   }
+}
+
 inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode(
     hipTextureFilterMode kind) {
     switch (kind) {
@@ -894,6 +949,39 @@ inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int fla
     return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
 }
 
+inline static hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice,
+                                      int device) {
+    return hipCUDAErrorTohipError(cudaMemAdvise(dev_ptr, count,
+        hipMemoryAdviseTocudaMemoryAdvise(advice), device));
+}
+
+inline static hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
+                                             hipStream_t stream __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaMemPrefetchAsync(dev_ptr, count, device, stream));
+}
+
+inline static hipError_t hipMemRangeGetAttribute(void* data, size_t data_size,
+                                                 hipMemRangeAttribute attribute,
+                                                 const void* dev_ptr, size_t count) {
+    return hipCUDAErrorTohipError(cudaMemRangeGetAttribute(data, data_size,
+        hipMemRangeAttributeTocudaMemRangeAttribute(attribute), dev_ptr, count));
+}
+
+inline static hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
+                                                  hipMemRangeAttribute* attributes,
+                                                  size_t num_attributes, const void* dev_ptr,
+                                                  size_t count) {
+    auto attrs = hipMemRangeAttributeTocudaMemRangeAttribute(*attributes);
+    return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, &attrs,
+        num_attributes, dev_ptr, count));
+}
+
+inline static hipError_t hipStreamAttachMemAsync(hipStream_t stream, hipDeviceptr_t* dev_ptr,
+                                                 size_t length __dparm(0),
+                                                 unsigned int flags __dparm(hipMemAttachSingle)) {
+    return hipCUDAErrorTohipError(cudaStreamAttachMemAsync(stream, dev_ptr, length, flags));
+}
+
 inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags));
 }

From 1bb69ec78a9b9b32438121a2d2706c3276744e79 Mon Sep 17 00:00:00 2001
From: Julia Jiang <julia.jiang@amd.com>
Date: Thu, 4 Feb 2021 16:22:01 -0500
Subject: [PATCH 003/177] SWDEV-271416 - Remove HIP_DYNAMIC_SHARED macro in hip

Change-Id: I12f39ea8438eb7ce76d8ffb2151b4faa93689048
---
 hipnv/include/hip/nvidia_detail/hip_runtime.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime.h b/hipnv/include/hip/nvidia_detail/hip_runtime.h
index 84414fb4a3..dfe41cf3c3 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime.h
@@ -95,14 +95,6 @@ typedef int hipLaunchParm;
 
 #define HIP_SYMBOL(X) &X
 
-/**
- * extern __shared__
- */
-
-#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
-
-#define HIP_DYNAMIC_SHARED_ATTRIBUTE
-
 #ifdef __HIP_DEVICE_COMPILE__
 #define abort_()                                                                                    \
     { asm("trap;"); }

From 27378a434166ce8e72c68716720b7d66801d972f Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Tue, 16 Feb 2021 00:09:32 -0500
Subject: [PATCH 004/177] SWDEV-271416 - Keep HIP_DYNAMIC_SHARED macro for
 compat with existing HIP apps

Change-Id: I536e0c65d6e8696cbc369082350834abd710abca
---
 hipnv/include/hip/nvidia_detail/hip_runtime.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime.h b/hipnv/include/hip/nvidia_detail/hip_runtime.h
index dfe41cf3c3..a42fecc611 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime.h
@@ -95,6 +95,13 @@ typedef int hipLaunchParm;
 
 #define HIP_SYMBOL(X) &X
 
+/**
+ * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
+ * To be removed in a future release.
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
 #ifdef __HIP_DEVICE_COMPILE__
 #define abort_()                                                                                    \
     { asm("trap;"); }

From 6086537b5df7f6f1a785fb74f03aee39381b9da6 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Mon, 8 Feb 2021 12:48:07 -0500
Subject: [PATCH 005/177] SWDEV-271491 - Fix flags for hipSetDeviceFlags on
 HIP-CUDA path

Change-Id: I29446d5cc5a26a4b83fa45175ccdf1d8f3a9ea40
---
 hipnv/include/hip/nvidia_detail/hip_runtime_api.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index 827374da5d..b195b19436 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -351,7 +351,14 @@ typedef cudaSurfaceObject_t hipSurfaceObject_t;
 #define hipTextureType2D cudaTextureType2D
 #define hipTextureType2DLayered cudaTextureType2DLayered
 #define hipTextureType3D cudaTextureType3D
+
+#define hipDeviceScheduleAuto cudaDeviceScheduleAuto
+#define hipDeviceScheduleSpin cudaDeviceScheduleSpin
+#define hipDeviceScheduleYield cudaDeviceScheduleYield
+#define hipDeviceScheduleBlockingSync cudaDeviceScheduleBlockingSync
+#define hipDeviceScheduleMask cudaDeviceScheduleMask
 #define hipDeviceMapHost cudaDeviceMapHost
+#define hipDeviceLmemResizeToMax cudaDeviceLmemResizeToMax
 
 #define hipCpuDeviceId cudaCpuDeviceId
 #define hipInvalidDeviceId cudaInvalidDeviceId

From c9f5e744f3850b6871b65c61cc5b8674a816b79e Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Fri, 26 Feb 2021 14:32:12 +0000
Subject: [PATCH 006/177] SWDEV-274404 - Add hipDrvMemcpy3D* and
 hipMemcpy2DFromArray* APIs on HIP CUDA

Change-Id: I4aba2bff60a7bae6b01b6e471968227b0df8e192
---
 .../hip/nvidia_detail/hip_runtime_api.h       | 49 +++++++++++++++++--
 1 file changed, 45 insertions(+), 4 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index b195b19436..29b26489f8 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -95,6 +95,7 @@ typedef enum hipMemRangeAttribute {
 #define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL
 
 #define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR
+#define HIP_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR
 
 //hipArray_Format
 #define HIP_AD_FORMAT_UNSIGNED_INT8   CU_AD_FORMAT_UNSIGNED_INT8
@@ -337,6 +338,7 @@ typedef struct cudaFuncAttributes hipFuncAttributes;
 typedef struct cudaLaunchParams hipLaunchParams;
 #define hipFunction_attribute CUfunction_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
+#define HIP_MEMCPY3D CUDA_MEMCPY3D
 #define hipMemcpy3DParms cudaMemcpy3DParms
 #define hipArrayDefault cudaArrayDefault
 #define hipArrayLayered cudaArrayLayered
@@ -1164,16 +1166,22 @@ inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStr
   return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));
 }
 
-inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p)
-{
+inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) {
     return hipCUDAErrorTohipError(cudaMemcpy3D(p));
 }
 
-inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream)
-{
+inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));
 }
 
+inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {
+    return hipCUResultTohipError(cuMemcpy3D(pCopy));
+}
+
+inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {
+    return hipCUResultTohipError(cuMemcpy3DAsync(pCopy, stream));
+}
+
 inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
                                           size_t width, size_t height, hipMemcpyKind kind,
                                           hipStream_t stream) {
@@ -1181,6 +1189,24 @@ inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void*
                                                     hipMemcpyKindToCudaMemcpyKind(kind), stream));
 }
 
+inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_const_t src,
+                                              size_t wOffset, size_t hOffset, size_t width,
+                                              size_t height, hipMemcpyKind kind) {
+    // 'src' is only read, so it is taken as a const array handle; hipArray* still converts.
+    return hipCUDAErrorTohipError(cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width,
+                                                        height, hipMemcpyKindToCudaMemcpyKind(kind)));
+}
+
+inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_const_t src,
+                                                   size_t wOffset, size_t hOffset, size_t width,
+                                                   size_t height, hipMemcpyKind kind,
+                                                   hipStream_t stream) {
+    // 'src' is only read, so it is taken as a const array handle; hipArray* still converts.
+    return hipCUDAErrorTohipError(cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset,
+                                                             width, height,
+                                                             hipMemcpyKindToCudaMemcpyKind(kind), stream));
+}
+
 inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
                                             const void* src, size_t spitch, size_t width,
                                             size_t height, hipMemcpyKind kind) {
@@ -1188,6 +1214,16 @@ inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_
                                                       height, hipMemcpyKindToCudaMemcpyKind(kind)));
 }
 
+inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset,
+                                                 const void* src, size_t spitch, size_t width,
+                                                 size_t height, hipMemcpyKind kind,
+                                                 hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch,
+                                                           width, height,
+                                                           hipMemcpyKindToCudaMemcpyKind(kind),
+                                                           stream));
+}
+
 __HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
                                                            size_t hOffset, const void* src,
                                                            size_t count, hipMemcpyKind kind) {
@@ -2135,6 +2171,11 @@ inline static hipError_t hipArrayDestroy(hiparray hArray){
    return hipCUResultTohipError(cuArrayDestroy(hArray));
 }
 
+inline static hipError_t hipArray3DCreate(hiparray* pHandle,
+                                          const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray){
+   return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+}
+
 #endif  //__CUDACC__
 
 #endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H

From 4fc75f95363271ef55ba73b395d70584d0056e27 Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Mon, 19 Apr 2021 21:58:36 -0400
Subject: [PATCH 007/177] SWDEV-279657 - Fix hipMallocManaged-N256M failure

Add concurrentManagedAccess detection in hipMallocManaged test.
Skip test when device doesn't support concurrentManagedAccess.

Change-Id: Ie54046feef3baba857a7068972ec1fc1a60c2dfd
---
 hipnv/include/hip/nvidia_detail/hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index 29b26489f8..558bfe5195 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -1561,6 +1561,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeCooperativeMultiDeviceLaunch:
             cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
             break;
+        case hipDeviceAttributeConcurrentManagedAccess:
+            cdattr = cudaDevAttrConcurrentManagedAccess;
+            break;
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }

From fff92fd60ec495427bc44c1ca382b7712005432d Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Mon, 19 Apr 2021 11:25:05 -0700
Subject: [PATCH 008/177] SWDEV-282361 - HIP support for NVRTC

Change-Id: I82869af8194b595f9a2fb64ad034630786189335
---
 hipnv/include/hip/nvidia_detail/hiprtc.h | 168 +++++++++++++++++++++++
 1 file changed, 168 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/hiprtc.h

diff --git a/hipnv/include/hip/nvidia_detail/hiprtc.h b/hipnv/include/hip/nvidia_detail/hiprtc.h
new file mode 100644
index 0000000000..449ba26c0f
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/hiprtc.h
@@ -0,0 +1,168 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+#include <cuda.h>
+#include <nvrtc.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+#if !defined(_WIN32)
+#pragma GCC visibility push(default)
+#endif
+
+typedef enum hiprtcResult {
+  HIPRTC_SUCCESS = 0,
+  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  HIPRTC_ERROR_INVALID_INPUT = 3,
+  HIPRTC_ERROR_INVALID_PROGRAM = 4,
+  HIPRTC_ERROR_INVALID_OPTION = 5,
+  HIPRTC_ERROR_COMPILATION = 6,
+  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
+
+inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {  // HIPRTC -> NVRTC code
+  switch (result) {
+    case HIPRTC_SUCCESS:
+      return NVRTC_SUCCESS;
+    case HIPRTC_ERROR_OUT_OF_MEMORY:
+      return NVRTC_ERROR_OUT_OF_MEMORY;
+    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+    case HIPRTC_ERROR_INVALID_INPUT:
+      return NVRTC_ERROR_INVALID_INPUT;
+    case HIPRTC_ERROR_INVALID_PROGRAM:
+      return NVRTC_ERROR_INVALID_PROGRAM;
+    case HIPRTC_ERROR_INVALID_OPTION:
+      return NVRTC_ERROR_INVALID_OPTION;
+    case HIPRTC_ERROR_COMPILATION:
+      return NVRTC_ERROR_COMPILATION;
+    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      return NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+    default:  // HIPRTC_ERROR_INTERNAL_ERROR, plus any value outside the enum (no fall-off-the-end)
+      return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+}
+
+inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {  // NVRTC -> HIPRTC code
+  switch (result) {
+    case NVRTC_SUCCESS:
+      return HIPRTC_SUCCESS;
+    case NVRTC_ERROR_OUT_OF_MEMORY:
+      return HIPRTC_ERROR_OUT_OF_MEMORY;
+    case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+    case NVRTC_ERROR_INVALID_INPUT:
+      return HIPRTC_ERROR_INVALID_INPUT;
+    case NVRTC_ERROR_INVALID_PROGRAM:
+      return HIPRTC_ERROR_INVALID_PROGRAM;
+    case NVRTC_ERROR_INVALID_OPTION:
+      return HIPRTC_ERROR_INVALID_OPTION;
+    case NVRTC_ERROR_COMPILATION:
+      return HIPRTC_ERROR_COMPILATION;
+    case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+    case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+    case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      return HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+    case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+    default:  // NVRTC_ERROR_INTERNAL_ERROR, plus any code this header has no HIPRTC name for
+      return HIPRTC_ERROR_INTERNAL_ERROR;
+  }
+}
+
+inline static const char* hiprtcGetErrorString(hiprtcResult result) {  // header-only: internal linkage
+  return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));  // text is produced by NVRTC
+}
+
+inline static hiprtcResult hiprtcVersion(int* major, int* minor) {  // header-only: internal linkage
+  return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));  // values filled in by nvrtcVersion
+}
+
+typedef nvrtcProgram hiprtcProgram;
+
+inline static hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
+  return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));  // forwards to NVRTC
+}
+
+inline static hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
+  return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));  // options passed as-is
+}
+
+inline static hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src,
+    const char* name, int numHeaders, const char** headers, const char** includeNames) {
+  return nvrtcResultTohiprtcResult(  // header-only wrapper: all arguments forwarded unchanged
+      nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
+}
+
+inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {  // header-only: internal linkage
+  return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));  // forwards to NVRTC
+}
+
+inline static hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
+                                                const char** lowered_name) {
+  return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));  // forwards
+}
+
+inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {  // header-only wrapper
+  return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));  // 'log' is written by NVRTC
+}
+
+inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+  return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));  // forwards to NVRTC
+}
+
+inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {  // header-only wrapper
+  return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));  // on this backend the "code" is PTX
+}
+
+inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
+  return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));  // size of the PTX image
+}
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif  // HIPRTC_H

From 49f81037f359bf175f004af1acbcb21dc79806b5 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Fri, 16 Apr 2021 13:59:23 +0000
Subject: [PATCH 009/177] SWDEV-281789 - Add missing HMM attributes on CUDA
 path

Change-Id: I11167eea006301e01a1f91708bf6d0ac832d8b7f
---
 hipnv/include/hip/nvidia_detail/hip_runtime_api.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index 558bfe5195..1bac52f424 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -1564,6 +1564,18 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeConcurrentManagedAccess:
             cdattr = cudaDevAttrConcurrentManagedAccess;
             break;
+        case hipDeviceAttributeManagedMemory:
+            cdattr = cudaDevAttrManagedMemory;
+            break;
+        case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
+            cdattr = cudaDevAttrPageableMemoryAccessUsesHostPageTables;
+            break;
+        case hipDeviceAttributePageableMemoryAccess:
+            cdattr = cudaDevAttrPageableMemoryAccess;
+            break;
+        case hipDeviceAttributeDirectManagedMemAccessFromHost:
+            cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
+            break;
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }

From 5165e3650a8db95fc64e3ccbb58485598defd078 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 28 Apr 2021 17:28:23 +0000
Subject: [PATCH 010/177] SWDEV-283388 - Fix hipMemRangeGetAttributes on Nvidia
 Platform

Change-Id: I5daeacd9dd5c6ce7f914d6e6e45dd41fb2a675a5
hipMemRangeGetAttributes was returning hipErrorInvalidValue due to improper
mapping of the arguments to cudaMemRangeGetAttributes.
---
 .../hip/nvidia_detail/hip_runtime_api.h       | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
index 1bac52f424..66e4743abd 100644
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
@@ -72,13 +72,6 @@ typedef enum hipMemoryAdvise {
     hipMemAdviseUnsetAccessedBy
 } hipMemoryAdvise;
 
-typedef enum hipMemRangeAttribute {
-    hipMemRangeAttributeReadMostly,
-    hipMemRangeAttributePreferredLocation,
-    hipMemRangeAttributeAccessedBy,
-    hipMemRangeAttributeLastPrefetchLocation
-} hipMemRangeAttribute;
-
 // hipDataType
 #define hipDataType cudaDataType
 #define HIP_R_16F CUDA_R_16F
@@ -228,6 +221,13 @@ typedef enum cudaChannelFormatKind hipChannelFormatKind;
 #define hipChannelFormatKindFloat       cudaChannelFormatKindFloat
 #define hipChannelFormatKindNone        cudaChannelFormatKindNone
 
+// hipMemRangeAttribute
+typedef enum cudaMemRangeAttribute hipMemRangeAttribute;
+#define hipMemRangeAttributeReadMostly cudaMemRangeAttributeReadMostly
+#define hipMemRangeAttributePreferredLocation cudaMemRangeAttributePreferredLocation
+#define hipMemRangeAttributeAccessedBy cudaMemRangeAttributeAccessedBy
+#define hipMemRangeAttributeLastPrefetchLocation cudaMemRangeAttributeLastPrefetchLocation
+
 #define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode
 #define hipBoundaryModeZero cudaBoundaryModeZero
 #define hipBoundaryModeTrap cudaBoundaryModeTrap
@@ -826,7 +826,7 @@ inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddr
     }
 }
 
-inline static enum cudaMemRangeAttribute hipMemRangeAttributeTocudaMemRangeAttribute(
+inline static enum cudaMemRangeAttribute hipMemRangeAttributeToCudaMemRangeAttribute(
    hipMemRangeAttribute kind) {
    switch (kind) {
        case hipMemRangeAttributeReadMostly:
@@ -973,15 +973,14 @@ inline static hipError_t hipMemRangeGetAttribute(void* data, size_t data_size,
                                                  hipMemRangeAttribute attribute,
                                                  const void* dev_ptr, size_t count) {
     return hipCUDAErrorTohipError(cudaMemRangeGetAttribute(data, data_size,
-        hipMemRangeAttributeTocudaMemRangeAttribute(attribute), dev_ptr, count));
+        hipMemRangeAttributeToCudaMemRangeAttribute(attribute), dev_ptr, count));
 }
 
 inline static hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
                                                   hipMemRangeAttribute* attributes,
                                                   size_t num_attributes, const void* dev_ptr,
                                                   size_t count) {
-    auto attrs = hipMemRangeAttributeTocudaMemRangeAttribute(*attributes);
-    return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, &attrs,
+    return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, attributes,
         num_attributes, dev_ptr, count));
 }
 

From 358b6fe5c4c5ee10d549e1bfe5c770b290448bb7 Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Fri, 28 May 2021 23:15:18 +0000
Subject: [PATCH 011/177] SWDEV-288707 - Merge HIP refactored code to staging

These changes move:
- the rocclr-based implementation into src/hipamd/src
- the platform-specific headers (both nvidia and amd) into
  src/hipamd/include/hip

Change-Id: Ia29791a727244952591fe1d813dcef0303b73a9e
---
 .../hip/nvidia_detail/channel_descriptor.h    |   28 -
 hipnv/include/hip/nvidia_detail/hip_complex.h |  119 -
 .../nvidia_detail/hip_cooperative_groups.h    |   12 -
 hipnv/include/hip/nvidia_detail/hip_runtime.h |  122 -
 .../hip/nvidia_detail/hip_runtime_api.h       | 2195 -----------------
 .../hip/nvidia_detail/hip_texture_types.h     |    6 -
 hipnv/include/hip/nvidia_detail/hiprtc.h      |  168 --
 7 files changed, 2650 deletions(-)
 delete mode 100644 hipnv/include/hip/nvidia_detail/channel_descriptor.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hip_complex.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hip_runtime.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hip_runtime_api.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hip_texture_types.h
 delete mode 100644 hipnv/include/hip/nvidia_detail/hiprtc.h

diff --git a/hipnv/include/hip/nvidia_detail/channel_descriptor.h b/hipnv/include/hip/nvidia_detail/channel_descriptor.h
deleted file mode 100644
index 7eb0e65fda..0000000000
--- a/hipnv/include/hip/nvidia_detail/channel_descriptor.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
-
-#include "channel_descriptor.h"
-
-#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_complex.h b/hipnv/include/hip/nvidia_detail/hip_complex.h
deleted file mode 100644
index 10a53d1743..0000000000
--- a/hipnv/include/hip/nvidia_detail/hip_complex.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
-
-#include "cuComplex.h"
-
-typedef cuFloatComplex hipFloatComplex;
-
-__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); }
-
-__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); }
-
-__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
-    return make_cuFloatComplex(a, b);
-}
-
-__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); }
-
-__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {
-    return cuCabsf(z) * cuCabsf(z);
-}
-
-__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
-    return cuCaddf(p, q);
-}
-
-__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
-    return cuCsubf(p, q);
-}
-
-__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
-    return cuCmulf(p, q);
-}
-
-__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
-    return cuCdivf(p, q);
-}
-
-__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); }
-
-typedef cuDoubleComplex hipDoubleComplex;
-
-__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); }
-
-__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); }
-
-__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
-    return make_cuDoubleComplex(a, b);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); }
-
-__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {
-    return cuCabs(z) * cuCabs(z);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
-    return cuCadd(p, q);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
-    return cuCsub(p, q);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
-    return cuCmul(p, q);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
-    return cuCdiv(p, q);
-}
-
-__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); }
-
-typedef cuFloatComplex hipComplex;
-
-__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
-    return make_cuComplex(x, y);
-}
-
-__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
-    return cuComplexDoubleToFloat(z);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
-    return cuComplexFloatToDouble(z);
-}
-
-__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
-    return cuCfmaf(p, q, r);
-}
-
-__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
-                                                           hipDoubleComplex r) {
-    return cuCfma(p, q, r);
-}
-
-#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h b/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
deleted file mode 100644
index fc98ae2281..0000000000
--- a/hipnv/include/hip/nvidia_detail/hip_cooperative_groups.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
-
-// Include CUDA headers
-#include <cuda_runtime.h>
-#include <cooperative_groups.h>
-
-// Include HIP wrapper headers around CUDA
-#include <hip/hip_runtime.h>
-#include <hip/hip_runtime_api.h>
-
-#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime.h b/hipnv/include/hip/nvidia_detail/hip_runtime.h
deleted file mode 100644
index a42fecc611..0000000000
--- a/hipnv/include/hip/nvidia_detail/hip_runtime.h
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
-
-#include <cuda_runtime.h>
-
-#include <hip/hip_runtime_api.h>
-
-#define HIP_KERNEL_NAME(...) __VA_ARGS__
-
-typedef int hipLaunchParm;
-
-#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
-    do {                                                                                           \
-        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
-    } while (0)
-
-#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
-
-#define hipReadModeElementType cudaReadModeElementType
-
-#ifdef __CUDA_ARCH__
-
-
-// 32-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
-#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
-#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
-
-// 64-bit Atomics:
-#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
-
-// Doubles
-#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
-
-// warp cross-lane operations:
-#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
-#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
-#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
-
-// sync
-#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
-
-// misc
-#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
-#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
-
-#endif
-
-#ifdef __CUDACC__
-
-
-#define hipThreadIdx_x threadIdx.x
-#define hipThreadIdx_y threadIdx.y
-#define hipThreadIdx_z threadIdx.z
-
-#define hipBlockIdx_x blockIdx.x
-#define hipBlockIdx_y blockIdx.y
-#define hipBlockIdx_z blockIdx.z
-
-#define hipBlockDim_x blockDim.x
-#define hipBlockDim_y blockDim.y
-#define hipBlockDim_z blockDim.z
-
-#define hipGridDim_x gridDim.x
-#define hipGridDim_y gridDim.y
-#define hipGridDim_z gridDim.z
-
-#define HIP_SYMBOL(X) &X
-
-/**
- * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
- * To be removed in a future release.
- */
-#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
-#define HIP_DYNAMIC_SHARED_ATTRIBUTE
-
-#ifdef __HIP_DEVICE_COMPILE__
-#define abort_()                                                                                    \
-    { asm("trap;"); }
-#undef assert
-#define assert(COND)                                                                               \
-    {                                                                                              \
-        if (!COND) {                                                                               \
-            abort_();                                                                               \
-        }                                                                                          \
-    }
-#endif
-
-#define __clock() clock()
-#define __clock64() clock64()
-
-#endif
-
-#endif
diff --git a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
deleted file mode 100644
index 66e4743abd..0000000000
--- a/hipnv/include/hip/nvidia_detail/hip_runtime_api.h
+++ /dev/null
@@ -1,2195 +0,0 @@
-/*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
-
-#include <cuda_runtime_api.h>
-#include <cuda.h>
-#include <cuda_profiler_api.h>
-#include <cuda_fp16.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#ifdef __cplusplus
-#define __dparm(x) = x
-#else
-#define __dparm(x)
-#endif
-
-// Add Deprecated Support for CUDA Mapped HIP APIs
-#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
-#define __HIP_DEPRECATED
-#elif defined(_MSC_VER)
-#define __HIP_DEPRECATED __declspec(deprecated)
-#elif defined(__GNUC__)
-#define __HIP_DEPRECATED __attribute__((deprecated))
-#else
-#define __HIP_DEPRECATED
-#endif
-
-
-// TODO -move to include/hip_runtime_api.h as a common implementation.
-/**
- * Memory copy types
- *
- */
-typedef enum hipMemcpyKind {
-    hipMemcpyHostToHost,
-    hipMemcpyHostToDevice,
-    hipMemcpyDeviceToHost,
-    hipMemcpyDeviceToDevice,
-    hipMemcpyDefault
-} hipMemcpyKind;
-
-typedef enum hipMemoryAdvise {
-    hipMemAdviseSetReadMostly,
-    hipMemAdviseUnsetReadMostly,
-    hipMemAdviseSetPreferredLocation,
-    hipMemAdviseUnsetPreferredLocation,
-    hipMemAdviseSetAccessedBy,
-    hipMemAdviseUnsetAccessedBy
-} hipMemoryAdvise;
-
-// hipDataType
-#define hipDataType cudaDataType
-#define HIP_R_16F CUDA_R_16F
-#define HIP_R_32F CUDA_R_32F
-#define HIP_R_64F CUDA_R_64F
-#define HIP_C_16F CUDA_C_16F
-#define HIP_C_32F CUDA_C_32F
-#define HIP_C_64F CUDA_C_64F
-
-// hipLibraryPropertyType
-#define hipLibraryPropertyType libraryPropertyType
-#define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
-#define HIP_LIBRARY_MINOR_VERSION MINOR_VERSION
-#define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL
-
-#define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR
-#define HIP_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR
-
-//hipArray_Format
-#define HIP_AD_FORMAT_UNSIGNED_INT8   CU_AD_FORMAT_UNSIGNED_INT8
-#define HIP_AD_FORMAT_UNSIGNED_INT16  CU_AD_FORMAT_UNSIGNED_INT16
-#define HIP_AD_FORMAT_UNSIGNED_INT32  CU_AD_FORMAT_UNSIGNED_INT32
-#define HIP_AD_FORMAT_SIGNED_INT8     CU_AD_FORMAT_SIGNED_INT8
-#define HIP_AD_FORMAT_SIGNED_INT16    CU_AD_FORMAT_SIGNED_INT16
-#define HIP_AD_FORMAT_SIGNED_INT32    CU_AD_FORMAT_SIGNED_INT32
-#define HIP_AD_FORMAT_HALF            CU_AD_FORMAT_HALF
-#define HIP_AD_FORMAT_FLOAT           CU_AD_FORMAT_FLOAT
-
-// hipArray_Format
-#define hipArray_Format CUarray_format
-
-inline static CUarray_format hipArray_FormatToCUarray_format(
-    hipArray_Format format) {
-    switch (format) {
-        case HIP_AD_FORMAT_UNSIGNED_INT8:
-            return CU_AD_FORMAT_UNSIGNED_INT8;
-        case HIP_AD_FORMAT_UNSIGNED_INT16:
-            return CU_AD_FORMAT_UNSIGNED_INT16;
-        case HIP_AD_FORMAT_UNSIGNED_INT32:
-            return CU_AD_FORMAT_UNSIGNED_INT32;
-        case HIP_AD_FORMAT_SIGNED_INT8:
-            return CU_AD_FORMAT_SIGNED_INT8;
-        case HIP_AD_FORMAT_SIGNED_INT16:
-            return CU_AD_FORMAT_SIGNED_INT16;
-        case HIP_AD_FORMAT_SIGNED_INT32:
-            return CU_AD_FORMAT_SIGNED_INT32;
-        case HIP_AD_FORMAT_HALF:
-            return CU_AD_FORMAT_HALF;
-        case HIP_AD_FORMAT_FLOAT:
-            return CU_AD_FORMAT_FLOAT;
-        default:
-            return CU_AD_FORMAT_UNSIGNED_INT8;
-    }
-}
-
-#define HIP_TR_ADDRESS_MODE_WRAP   CU_TR_ADDRESS_MODE_WRAP
-#define HIP_TR_ADDRESS_MODE_CLAMP  CU_TR_ADDRESS_MODE_CLAMP
-#define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR
-#define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER
-
-// hipAddress_mode
-#define hipAddress_mode CUaddress_mode
-
-inline static CUaddress_mode hipAddress_modeToCUaddress_mode(
-    hipAddress_mode mode) {
-    switch (mode) {
-        case HIP_TR_ADDRESS_MODE_WRAP:
-            return CU_TR_ADDRESS_MODE_WRAP;
-        case HIP_TR_ADDRESS_MODE_CLAMP:
-            return CU_TR_ADDRESS_MODE_CLAMP;
-        case HIP_TR_ADDRESS_MODE_MIRROR:
-            return CU_TR_ADDRESS_MODE_MIRROR;
-        case HIP_TR_ADDRESS_MODE_BORDER:
-            return CU_TR_ADDRESS_MODE_BORDER;
-        default:
-            return CU_TR_ADDRESS_MODE_WRAP;
-    }
-}
-
-#define HIP_TR_FILTER_MODE_POINT   CU_TR_FILTER_MODE_POINT
-#define HIP_TR_FILTER_MODE_LINEAR  CU_TR_FILTER_MODE_LINEAR
-
-// hipFilter_mode
-#define hipFilter_mode CUfilter_mode
-
-inline static CUfilter_mode hipFilter_mode_enumToCUfilter_mode(
-    hipFilter_mode mode) {
-    switch (mode) {
-        case HIP_TR_FILTER_MODE_POINT:
-            return CU_TR_FILTER_MODE_POINT;
-        case HIP_TR_FILTER_MODE_LINEAR:
-            return CU_TR_FILTER_MODE_LINEAR;
-        default:
-            return CU_TR_FILTER_MODE_POINT;
-    }
-}
-
-//hipResourcetype
-#define HIP_RESOURCE_TYPE_ARRAY            CU_RESOURCE_TYPE_ARRAY
-#define HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY  CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
-#define HIP_RESOURCE_TYPE_LINEAR           CU_RESOURCE_TYPE_LINEAR
-#define HIP_RESOURCE_TYPE_PITCH2D          CU_RESOURCE_TYPE_PITCH2D
-
-// hipResourcetype
-#define hipResourcetype CUresourcetype
-
-inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
-    hipResourcetype resType) {
-    switch (resType) {
-        case HIP_RESOURCE_TYPE_ARRAY:
-            return CU_RESOURCE_TYPE_ARRAY;
-        case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
-            return CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
-        case HIP_RESOURCE_TYPE_LINEAR:
-            return CU_RESOURCE_TYPE_LINEAR;
-        case HIP_RESOURCE_TYPE_PITCH2D:
-            return CU_RESOURCE_TYPE_PITCH2D;
-        default:
-            return CU_RESOURCE_TYPE_ARRAY;
-    }
-}
-
-#define hipTexRef CUtexref
-#define hiparray CUarray
-
-// hipTextureAddressMode
-typedef enum cudaTextureAddressMode hipTextureAddressMode;
-#define hipAddressModeWrap cudaAddressModeWrap
-#define hipAddressModeClamp cudaAddressModeClamp
-#define hipAddressModeMirror cudaAddressModeMirror
-#define hipAddressModeBorder cudaAddressModeBorder
-
-// hipTextureFilterMode
-typedef enum cudaTextureFilterMode hipTextureFilterMode;
-#define hipFilterModePoint cudaFilterModePoint
-#define hipFilterModeLinear cudaFilterModeLinear
-
-// hipTextureReadMode
-typedef enum cudaTextureReadMode hipTextureReadMode;
-#define hipReadModeElementType cudaReadModeElementType
-#define hipReadModeNormalizedFloat cudaReadModeNormalizedFloat
-
-// hipChannelFormatKind
-typedef enum cudaChannelFormatKind hipChannelFormatKind;
-#define hipChannelFormatKindSigned      cudaChannelFormatKindSigned
-#define hipChannelFormatKindUnsigned    cudaChannelFormatKindUnsigned
-#define hipChannelFormatKindFloat       cudaChannelFormatKindFloat
-#define hipChannelFormatKindNone        cudaChannelFormatKindNone
-
-// hipMemRangeAttribute
-typedef enum cudaMemRangeAttribute hipMemRangeAttribute;
-#define hipMemRangeAttributeReadMostly cudaMemRangeAttributeReadMostly
-#define hipMemRangeAttributePreferredLocation cudaMemRangeAttributePreferredLocation
-#define hipMemRangeAttributeAccessedBy cudaMemRangeAttributeAccessedBy
-#define hipMemRangeAttributeLastPrefetchLocation cudaMemRangeAttributeLastPrefetchLocation
-
-#define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode
-#define hipBoundaryModeZero cudaBoundaryModeZero
-#define hipBoundaryModeTrap cudaBoundaryModeTrap
-#define hipBoundaryModeClamp cudaBoundaryModeClamp
-
-// hipFuncCache
-#define hipFuncCachePreferNone cudaFuncCachePreferNone
-#define hipFuncCachePreferShared cudaFuncCachePreferShared
-#define hipFuncCachePreferL1 cudaFuncCachePreferL1
-#define hipFuncCachePreferEqual cudaFuncCachePreferEqual
-
-// hipResourceType
-#define hipResourceType cudaResourceType
-#define hipResourceTypeArray cudaResourceTypeArray
-#define hipResourceTypeMipmappedArray cudaResourceTypeMipmappedArray
-#define hipResourceTypeLinear cudaResourceTypeLinear
-#define hipResourceTypePitch2D cudaResourceTypePitch2D
-//
-// hipErrorNoDevice.
-
-
-//! Flags that can be used with hipEventCreateWithFlags:
-#define hipEventDefault cudaEventDefault
-#define hipEventBlockingSync cudaEventBlockingSync
-#define hipEventDisableTiming cudaEventDisableTiming
-#define hipEventInterprocess cudaEventInterprocess
-#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */
-#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */
-
-
-#define hipHostMallocDefault cudaHostAllocDefault
-#define hipHostMallocPortable cudaHostAllocPortable
-#define hipHostMallocMapped cudaHostAllocMapped
-#define hipHostMallocWriteCombined cudaHostAllocWriteCombined
-#define hipHostMallocCoherent 0x0
-#define hipHostMallocNonCoherent 0x0
-
-#define hipMemAttachGlobal cudaMemAttachGlobal
-#define hipMemAttachHost cudaMemAttachHost
-#define hipMemAttachSingle cudaMemAttachSingle
-
-#define hipHostRegisterDefault cudaHostRegisterDefault
-#define hipHostRegisterPortable cudaHostRegisterPortable
-#define hipHostRegisterMapped cudaHostRegisterMapped
-#define hipHostRegisterIoMemory cudaHostRegisterIoMemory
-
-#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
-#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
-#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
-#define hipLimitMallocHeapSize cudaLimitMallocHeapSize
-#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
-
-#define hipOccupancyDefault cudaOccupancyDefault
-
-#define hipCooperativeLaunchMultiDeviceNoPreSync    \
-        cudaCooperativeLaunchMultiDeviceNoPreSync
-#define hipCooperativeLaunchMultiDeviceNoPostSync   \
-        cudaCooperativeLaunchMultiDeviceNoPostSync
-
-
-// enum CUjit_option redefines
-#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
-#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
-#define hipJitOptionWallTime CU_JIT_WALL_TIME
-#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
-#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
-#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
-#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
-#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
-#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
-#define hipJitOptionTarget CU_JIT_TARGET
-#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
-#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
-#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
-#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
-#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
-#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
-#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
-#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
-
-typedef cudaEvent_t hipEvent_t;
-typedef cudaStream_t hipStream_t;
-typedef cudaIpcEventHandle_t hipIpcEventHandle_t;
-typedef cudaIpcMemHandle_t hipIpcMemHandle_t;
-typedef enum cudaLimit hipLimit_t;
-typedef enum cudaFuncAttribute hipFuncAttribute;
-typedef enum cudaFuncCache hipFuncCache_t;
-typedef CUcontext hipCtx_t;
-typedef enum cudaSharedMemConfig hipSharedMemConfig;
-typedef CUfunc_cache hipFuncCache;
-typedef CUjit_option hipJitOption;
-typedef CUdevice hipDevice_t;
-typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
-#define hipDevP2PAttrPerformanceRank cudaDevP2PAttrPerformanceRank
-#define hipDevP2PAttrAccessSupported cudaDevP2PAttrAccessSupported
-#define hipDevP2PAttrNativeAtomicSupported cudaDevP2PAttrNativeAtomicSupported
-#define hipDevP2PAttrHipArrayAccessSupported cudaDevP2PAttrCudaArrayAccessSupported
-#define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize
-#define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout
-
-typedef CUmodule hipModule_t;
-typedef CUfunction hipFunction_t;
-typedef CUdeviceptr hipDeviceptr_t;
-typedef struct cudaArray hipArray;
-typedef struct cudaArray* hipArray_t;
-typedef struct cudaArray* hipArray_const_t;
-typedef struct cudaFuncAttributes hipFuncAttributes;
-typedef struct cudaLaunchParams hipLaunchParams;
-#define hipFunction_attribute CUfunction_attribute
-#define hip_Memcpy2D CUDA_MEMCPY2D
-#define HIP_MEMCPY3D CUDA_MEMCPY3D
-#define hipMemcpy3DParms cudaMemcpy3DParms
-#define hipArrayDefault cudaArrayDefault
-#define hipArrayLayered cudaArrayLayered
-#define hipArraySurfaceLoadStore cudaArraySurfaceLoadStore
-#define hipArrayCubemap cudaArrayCubemap
-#define hipArrayTextureGather cudaArrayTextureGather
-
-typedef cudaTextureObject_t hipTextureObject_t;
-typedef cudaSurfaceObject_t hipSurfaceObject_t;
-#define hipTextureType1D cudaTextureType1D
-#define hipTextureType1DLayered cudaTextureType1DLayered
-#define hipTextureType2D cudaTextureType2D
-#define hipTextureType2DLayered cudaTextureType2DLayered
-#define hipTextureType3D cudaTextureType3D
-
-#define hipDeviceScheduleAuto cudaDeviceScheduleAuto
-#define hipDeviceScheduleSpin cudaDeviceScheduleSpin
-#define hipDeviceScheduleYield cudaDeviceScheduleYield
-#define hipDeviceScheduleBlockingSync cudaDeviceScheduleBlockingSync
-#define hipDeviceScheduleMask cudaDeviceScheduleMask
-#define hipDeviceMapHost cudaDeviceMapHost
-#define hipDeviceLmemResizeToMax cudaDeviceLmemResizeToMax
-
-#define hipCpuDeviceId cudaCpuDeviceId
-#define hipInvalidDeviceId cudaInvalidDeviceId
-typedef struct cudaExtent hipExtent;
-typedef struct cudaPitchedPtr hipPitchedPtr;
-#define make_hipExtent make_cudaExtent
-#define make_hipPos make_cudaPos
-#define make_hipPitchedPtr make_cudaPitchedPtr
-// Flags that can be used with hipStreamCreateWithFlags
-#define hipStreamDefault cudaStreamDefault
-#define hipStreamNonBlocking cudaStreamNonBlocking
-
-typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
-typedef struct cudaResourceDesc hipResourceDesc;
-typedef struct cudaTextureDesc hipTextureDesc;
-typedef struct cudaResourceViewDesc hipResourceViewDesc;
-// adding code for hipmemSharedConfig
-#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
-#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
-#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte
-
-//Function Attributes
-#define HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
-#define HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
-#define HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
-#define HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES
-#define HIP_FUNC_ATTRIBUTE_NUM_REGS CU_FUNC_ATTRIBUTE_NUM_REGS
-#define HIP_FUNC_ATTRIBUTE_PTX_VERSION CU_FUNC_ATTRIBUTE_PTX_VERSION
-#define HIP_FUNC_ATTRIBUTE_BINARY_VERSION CU_FUNC_ATTRIBUTE_BINARY_VERSION
-#define HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA CU_FUNC_ATTRIBUTE_CACHE_MODE_CA
-#define HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
-#define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
-#define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
-
-#if CUDA_VERSION >= 9000
-#define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_xor(...)  __shfl_xor_sync(0xffffffff, __VA_ARGS__)
-#endif // CUDA_VERSION >= 9000
-
-inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
-    switch (cuError) {
-        case cudaSuccess:
-            return hipSuccess;
-        case cudaErrorProfilerDisabled:
-            return hipErrorProfilerDisabled;
-        case cudaErrorProfilerNotInitialized:
-            return hipErrorProfilerNotInitialized;
-        case cudaErrorProfilerAlreadyStarted:
-            return hipErrorProfilerAlreadyStarted;
-        case cudaErrorProfilerAlreadyStopped:
-            return hipErrorProfilerAlreadyStopped;
-        case cudaErrorInsufficientDriver:
-            return hipErrorInsufficientDriver;
-        case cudaErrorUnsupportedLimit:
-            return hipErrorUnsupportedLimit;
-        case cudaErrorPeerAccessUnsupported:
-            return hipErrorPeerAccessUnsupported;
-        case cudaErrorInvalidGraphicsContext:
-            return hipErrorInvalidGraphicsContext;
-        case cudaErrorSharedObjectSymbolNotFound:
-            return hipErrorSharedObjectSymbolNotFound;
-        case cudaErrorSharedObjectInitFailed:
-            return hipErrorSharedObjectInitFailed;
-        case cudaErrorOperatingSystem:
-            return hipErrorOperatingSystem;
-        case cudaErrorSetOnActiveProcess:
-            return hipErrorSetOnActiveProcess;
-        case cudaErrorIllegalAddress:
-            return hipErrorIllegalAddress;
-        case cudaErrorInvalidSymbol:
-            return hipErrorInvalidSymbol;
-        case cudaErrorMissingConfiguration:
-            return hipErrorMissingConfiguration;
-        case cudaErrorMemoryAllocation:
-            return hipErrorOutOfMemory;
-        case cudaErrorInitializationError:
-            return hipErrorNotInitialized;
-        case cudaErrorLaunchFailure:
-            return hipErrorLaunchFailure;
-        case cudaErrorCooperativeLaunchTooLarge:
-            return hipErrorCooperativeLaunchTooLarge;
-        case cudaErrorPriorLaunchFailure:
-            return hipErrorPriorLaunchFailure;
-        case cudaErrorLaunchOutOfResources:
-            return hipErrorLaunchOutOfResources;
-        case cudaErrorInvalidDeviceFunction:
-            return hipErrorInvalidDeviceFunction;
-        case cudaErrorInvalidConfiguration:
-            return hipErrorInvalidConfiguration;
-        case cudaErrorInvalidDevice:
-            return hipErrorInvalidDevice;
-        case cudaErrorInvalidValue:
-            return hipErrorInvalidValue;
-        case cudaErrorInvalidDevicePointer:
-            return hipErrorInvalidDevicePointer;
-        case cudaErrorInvalidMemcpyDirection:
-            return hipErrorInvalidMemcpyDirection;
-        case cudaErrorInvalidResourceHandle:
-            return hipErrorInvalidHandle;
-        case cudaErrorNotReady:
-            return hipErrorNotReady;
-        case cudaErrorNoDevice:
-            return hipErrorNoDevice;
-        case cudaErrorPeerAccessAlreadyEnabled:
-            return hipErrorPeerAccessAlreadyEnabled;
-        case cudaErrorPeerAccessNotEnabled:
-            return hipErrorPeerAccessNotEnabled;
-        case cudaErrorHostMemoryAlreadyRegistered:
-            return hipErrorHostMemoryAlreadyRegistered;
-        case cudaErrorHostMemoryNotRegistered:
-            return hipErrorHostMemoryNotRegistered;
-        case cudaErrorMapBufferObjectFailed:
-            return hipErrorMapFailed;
-        case cudaErrorAssert:
-            return hipErrorAssert;
-        case cudaErrorNotSupported:
-            return hipErrorNotSupported;
-        case cudaErrorCudartUnloading:
-            return hipErrorDeinitialized;
-        case cudaErrorInvalidKernelImage:
-            return hipErrorInvalidImage;
-        case cudaErrorUnmapBufferObjectFailed:
-            return hipErrorUnmapFailed;
-        case cudaErrorNoKernelImageForDevice:
-            return hipErrorNoBinaryForGpu;
-        case cudaErrorECCUncorrectable:
-            return hipErrorECCNotCorrectable;
-        case cudaErrorDeviceAlreadyInUse:
-            return hipErrorContextAlreadyInUse;
-        case cudaErrorInvalidPtx:
-            return hipErrorInvalidKernelFile;
-        case cudaErrorLaunchTimeout:
-            return hipErrorLaunchTimeOut;
-#if CUDA_VERSION >= 10010
-        case cudaErrorInvalidSource:
-            return hipErrorInvalidSource;
-        case cudaErrorFileNotFound:
-            return hipErrorFileNotFound;
-        case cudaErrorSymbolNotFound:
-            return hipErrorNotFound;
-        case cudaErrorArrayIsMapped:
-            return hipErrorArrayIsMapped;
-        case cudaErrorNotMappedAsPointer:
-            return hipErrorNotMappedAsPointer;
-        case cudaErrorNotMappedAsArray:
-            return hipErrorNotMappedAsArray;
-        case cudaErrorNotMapped:
-            return hipErrorNotMapped;
-        case cudaErrorAlreadyAcquired:
-            return hipErrorAlreadyAcquired;
-        case cudaErrorAlreadyMapped:
-            return hipErrorAlreadyMapped;
-#endif
-#if CUDA_VERSION >= 10020
-        case cudaErrorDeviceUninitialized:
-            return hipErrorInvalidContext;
-#endif
-        case cudaErrorUnknown:
-        default:
-            return hipErrorUnknown;  // Note - translated error.
-    }
-}
-
-inline static hipError_t hipCUResultTohipError(CUresult cuError) {
-    switch (cuError) {
-        case CUDA_SUCCESS:
-            return hipSuccess;
-        case CUDA_ERROR_OUT_OF_MEMORY:
-            return hipErrorOutOfMemory;
-        case CUDA_ERROR_INVALID_VALUE:
-            return hipErrorInvalidValue;
-        case CUDA_ERROR_INVALID_DEVICE:
-            return hipErrorInvalidDevice;
-        case CUDA_ERROR_DEINITIALIZED:
-            return hipErrorDeinitialized;
-        case CUDA_ERROR_NO_DEVICE:
-            return hipErrorNoDevice;
-        case CUDA_ERROR_INVALID_CONTEXT:
-            return hipErrorInvalidContext;
-        case CUDA_ERROR_NOT_INITIALIZED:
-            return hipErrorNotInitialized;
-        case CUDA_ERROR_INVALID_HANDLE:
-            return hipErrorInvalidHandle;
-        case CUDA_ERROR_MAP_FAILED:
-            return hipErrorMapFailed;
-        case CUDA_ERROR_PROFILER_DISABLED:
-            return hipErrorProfilerDisabled;
-        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
-            return hipErrorProfilerNotInitialized;
-        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
-            return hipErrorProfilerAlreadyStarted;
-        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
-            return hipErrorProfilerAlreadyStopped;
-        case CUDA_ERROR_INVALID_IMAGE:
-            return hipErrorInvalidImage;
-        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
-            return hipErrorContextAlreadyCurrent;
-        case CUDA_ERROR_UNMAP_FAILED:
-            return hipErrorUnmapFailed;
-        case CUDA_ERROR_ARRAY_IS_MAPPED:
-            return hipErrorArrayIsMapped;
-        case CUDA_ERROR_ALREADY_MAPPED:
-            return hipErrorAlreadyMapped;
-        case CUDA_ERROR_NO_BINARY_FOR_GPU:
-            return hipErrorNoBinaryForGpu;
-        case CUDA_ERROR_ALREADY_ACQUIRED:
-            return hipErrorAlreadyAcquired;
-        case CUDA_ERROR_NOT_MAPPED:
-            return hipErrorNotMapped;
-        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
-            return hipErrorNotMappedAsArray;
-        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
-            return hipErrorNotMappedAsPointer;
-        case CUDA_ERROR_ECC_UNCORRECTABLE:
-            return hipErrorECCNotCorrectable;
-        case CUDA_ERROR_UNSUPPORTED_LIMIT:
-            return hipErrorUnsupportedLimit;
-        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
-            return hipErrorContextAlreadyInUse;
-        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
-            return hipErrorPeerAccessUnsupported;
-        case CUDA_ERROR_INVALID_PTX:
-            return hipErrorInvalidKernelFile;
-        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
-            return hipErrorInvalidGraphicsContext;
-        case CUDA_ERROR_INVALID_SOURCE:
-            return hipErrorInvalidSource;
-        case CUDA_ERROR_FILE_NOT_FOUND:
-            return hipErrorFileNotFound;
-        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
-            return hipErrorSharedObjectSymbolNotFound;
-        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
-            return hipErrorSharedObjectInitFailed;
-        case CUDA_ERROR_OPERATING_SYSTEM:
-            return hipErrorOperatingSystem;
-        case CUDA_ERROR_NOT_FOUND:
-            return hipErrorNotFound;
-        case CUDA_ERROR_NOT_READY:
-            return hipErrorNotReady;
-        case CUDA_ERROR_ILLEGAL_ADDRESS:
-            return hipErrorIllegalAddress;
-        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
-            return hipErrorLaunchOutOfResources;
-        case CUDA_ERROR_LAUNCH_TIMEOUT:
-            return hipErrorLaunchTimeOut;
-        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
-            return hipErrorPeerAccessAlreadyEnabled;
-        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
-            return hipErrorPeerAccessNotEnabled;
-        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
-            return hipErrorSetOnActiveProcess;
-        case CUDA_ERROR_ASSERT:
-            return hipErrorAssert;
-        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
-            return hipErrorHostMemoryAlreadyRegistered;
-        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
-            return hipErrorHostMemoryNotRegistered;
-        case CUDA_ERROR_LAUNCH_FAILED:
-            return hipErrorLaunchFailure;
-        case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
-            return hipErrorCooperativeLaunchTooLarge;
-        case CUDA_ERROR_NOT_SUPPORTED:
-            return hipErrorNotSupported;
-        case CUDA_ERROR_UNKNOWN:
-        default:
-            return hipErrorUnknown;  // Note - translated error.
-    }
-}
-
-inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
-    switch (hError) {
-        case hipSuccess:
-            return cudaSuccess;
-        case hipErrorOutOfMemory:
-            return cudaErrorMemoryAllocation;
-        case hipErrorProfilerDisabled:
-          return cudaErrorProfilerDisabled;
-        case hipErrorProfilerNotInitialized:
-            return cudaErrorProfilerNotInitialized;
-        case hipErrorProfilerAlreadyStarted:
-            return cudaErrorProfilerAlreadyStarted;
-        case hipErrorProfilerAlreadyStopped:
-            return cudaErrorProfilerAlreadyStopped;
-        case hipErrorInvalidConfiguration:
-            return cudaErrorInvalidConfiguration;
-        case hipErrorLaunchOutOfResources:
-            return cudaErrorLaunchOutOfResources;
-        case hipErrorInvalidValue:
-            return cudaErrorInvalidValue;
-        case hipErrorInvalidHandle:
-            return cudaErrorInvalidResourceHandle;
-        case hipErrorInvalidDevice:
-            return cudaErrorInvalidDevice;
-        case hipErrorInvalidMemcpyDirection:
-            return cudaErrorInvalidMemcpyDirection;
-        case hipErrorInvalidDevicePointer:
-            return cudaErrorInvalidDevicePointer;
-        case hipErrorNotInitialized:
-            return cudaErrorInitializationError;
-        case hipErrorNoDevice:
-            return cudaErrorNoDevice;
-        case hipErrorNotReady:
-            return cudaErrorNotReady;
-        case hipErrorPeerAccessNotEnabled:
-            return cudaErrorPeerAccessNotEnabled;
-        case hipErrorPeerAccessAlreadyEnabled:
-            return cudaErrorPeerAccessAlreadyEnabled;
-        case hipErrorHostMemoryAlreadyRegistered:
-            return cudaErrorHostMemoryAlreadyRegistered;
-        case hipErrorHostMemoryNotRegistered:
-            return cudaErrorHostMemoryNotRegistered;
-        case hipErrorDeinitialized:
-            return cudaErrorCudartUnloading;
-        case hipErrorInvalidSymbol:
-            return cudaErrorInvalidSymbol;
-        case hipErrorInsufficientDriver:
-            return cudaErrorInsufficientDriver;
-        case hipErrorMissingConfiguration:
-            return cudaErrorMissingConfiguration;
-        case hipErrorPriorLaunchFailure:
-            return cudaErrorPriorLaunchFailure;
-        case hipErrorInvalidDeviceFunction:
-            return cudaErrorInvalidDeviceFunction;
-        case hipErrorInvalidImage:
-            return cudaErrorInvalidKernelImage;
-        case hipErrorInvalidContext:
-#if CUDA_VERSION >= 10020
-            return cudaErrorDeviceUninitialized;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorMapFailed:
-            return cudaErrorMapBufferObjectFailed;
-        case hipErrorUnmapFailed:
-            return cudaErrorUnmapBufferObjectFailed;
-        case hipErrorArrayIsMapped:
-#if CUDA_VERSION >= 10010
-            return cudaErrorArrayIsMapped;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorAlreadyMapped:
-#if CUDA_VERSION >= 10010
-            return cudaErrorAlreadyMapped;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorNoBinaryForGpu:
-            return cudaErrorNoKernelImageForDevice;
-        case hipErrorAlreadyAcquired:
-#if CUDA_VERSION >= 10010
-            return cudaErrorAlreadyAcquired;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorNotMapped:
-#if CUDA_VERSION >= 10010
-            return cudaErrorNotMapped;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorNotMappedAsArray:
-#if CUDA_VERSION >= 10010
-            return cudaErrorNotMappedAsArray;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorNotMappedAsPointer:
-#if CUDA_VERSION >= 10010
-            return cudaErrorNotMappedAsPointer;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorECCNotCorrectable:
-            return cudaErrorECCUncorrectable;
-        case hipErrorUnsupportedLimit:
-            return cudaErrorUnsupportedLimit;
-        case hipErrorContextAlreadyInUse:
-            return cudaErrorDeviceAlreadyInUse;
-        case hipErrorPeerAccessUnsupported:
-            return cudaErrorPeerAccessUnsupported;
-        case hipErrorInvalidKernelFile:
-            return cudaErrorInvalidPtx;
-        case hipErrorInvalidGraphicsContext:
-            return cudaErrorInvalidGraphicsContext;
-        case hipErrorInvalidSource:
-#if CUDA_VERSION >= 10010
-            return cudaErrorInvalidSource;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorFileNotFound:
-#if CUDA_VERSION >= 10010
-            return cudaErrorFileNotFound;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorSharedObjectSymbolNotFound:
-            return cudaErrorSharedObjectSymbolNotFound;
-        case hipErrorSharedObjectInitFailed:
-            return cudaErrorSharedObjectInitFailed;
-        case hipErrorOperatingSystem:
-            return cudaErrorOperatingSystem;
-        case hipErrorNotFound:
-#if CUDA_VERSION >= 10010
-            return cudaErrorSymbolNotFound;
-#else
-            return cudaErrorUnknown;
-#endif
-        case hipErrorIllegalAddress:
-            return cudaErrorIllegalAddress;
-        case hipErrorLaunchTimeOut:
-            return cudaErrorLaunchTimeout;
-        case hipErrorSetOnActiveProcess:
-            return cudaErrorSetOnActiveProcess;
-        case hipErrorLaunchFailure:
-            return cudaErrorLaunchFailure;
-        case hipErrorCooperativeLaunchTooLarge:
-            return cudaErrorCooperativeLaunchTooLarge;
-        case hipErrorNotSupported:
-            return cudaErrorNotSupported;
-        // HSA: does not exist in CUDA
-        case hipErrorRuntimeMemory:
-        // HSA: does not exist in CUDA
-        case hipErrorRuntimeOther:
-        case hipErrorUnknown:
-        case hipErrorTbd:
-        default:
-            return cudaErrorUnknown;  // Note - translated error.
-    }
-}
-
-inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) {
-    switch (kind) {
-        case hipMemcpyHostToHost:
-            return cudaMemcpyHostToHost;
-        case hipMemcpyHostToDevice:
-            return cudaMemcpyHostToDevice;
-        case hipMemcpyDeviceToHost:
-            return cudaMemcpyDeviceToHost;
-        case hipMemcpyDeviceToDevice:
-            return cudaMemcpyDeviceToDevice;
-        default:
-            return cudaMemcpyDefault;
-    }
-}
-
-inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddressMode(
-    hipTextureAddressMode kind) {
-    switch (kind) {
-        case hipAddressModeWrap:
-            return cudaAddressModeWrap;
-        case hipAddressModeClamp:
-            return cudaAddressModeClamp;
-        case hipAddressModeMirror:
-            return cudaAddressModeMirror;
-        case hipAddressModeBorder:
-            return cudaAddressModeBorder;
-        default:
-            return cudaAddressModeWrap;
-    }
-}
-
-inline static enum cudaMemRangeAttribute hipMemRangeAttributeToCudaMemRangeAttribute(
-   hipMemRangeAttribute kind) {
-   switch (kind) {
-       case hipMemRangeAttributeReadMostly:
-           return cudaMemRangeAttributeReadMostly;
-       case hipMemRangeAttributePreferredLocation:
-           return cudaMemRangeAttributePreferredLocation;
-       case hipMemRangeAttributeAccessedBy:
-           return cudaMemRangeAttributeAccessedBy;
-       case hipMemRangeAttributeLastPrefetchLocation:
-           return cudaMemRangeAttributeLastPrefetchLocation;
-       default:
-           return cudaMemRangeAttributeReadMostly;
-   }
-}
-
-inline static enum cudaMemoryAdvise hipMemoryAdviseTocudaMemoryAdvise(
-    hipMemoryAdvise kind) {
-   switch (kind) {
-       case hipMemAdviseSetReadMostly:
-           return cudaMemAdviseSetReadMostly;
-       case hipMemAdviseUnsetReadMostly :
-           return cudaMemAdviseUnsetReadMostly ;
-       case hipMemAdviseSetPreferredLocation:
-           return cudaMemAdviseSetPreferredLocation;
-       case hipMemAdviseUnsetPreferredLocation:
-           return cudaMemAdviseUnsetPreferredLocation;
-       case hipMemAdviseSetAccessedBy:
-           return cudaMemAdviseSetAccessedBy;
-       case hipMemAdviseUnsetAccessedBy:
-           return cudaMemAdviseUnsetAccessedBy;
-       default:
-           return cudaMemAdviseSetReadMostly;
-   }
-}
-
-inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode(
-    hipTextureFilterMode kind) {
-    switch (kind) {
-        case hipFilterModePoint:
-            return cudaFilterModePoint;
-        case hipFilterModeLinear:
-            return cudaFilterModeLinear;
-        default:
-            return cudaFilterModePoint;
-    }
-}
-
-inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(hipTextureReadMode kind) {
-    switch (kind) {
-        case hipReadModeElementType:
-            return cudaReadModeElementType;
-        case hipReadModeNormalizedFloat:
-            return cudaReadModeNormalizedFloat;
-        default:
-            return cudaReadModeElementType;
-    }
-}
-
-inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormatKind(
-    hipChannelFormatKind kind) {
-    switch (kind) {
-        case hipChannelFormatKindSigned:
-            return cudaChannelFormatKindSigned;
-        case hipChannelFormatKindUnsigned:
-            return cudaChannelFormatKindUnsigned;
-        case hipChannelFormatKindFloat:
-            return cudaChannelFormatKindFloat;
-        case hipChannelFormatKindNone:
-            return cudaChannelFormatKindNone;
-        default:
-            return cudaChannelFormatKindNone;
-    }
-}
-
-/**
- * Stream CallBack struct
- */
-#define HIPRT_CB CUDART_CB
-typedef void(HIPRT_CB* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
-inline static hipError_t hipInit(unsigned int flags) {
-    return hipCUResultTohipError(cuInit(flags));
-}
-
-inline static hipError_t hipDeviceReset() { return hipCUDAErrorTohipError(cudaDeviceReset()); }
-
-inline static hipError_t hipGetLastError() { return hipCUDAErrorTohipError(cudaGetLastError()); }
-
-inline static hipError_t hipPeekAtLastError() {
-    return hipCUDAErrorTohipError(cudaPeekAtLastError());
-}
-
-inline static hipError_t hipMalloc(void** ptr, size_t size) {
-    return hipCUDAErrorTohipError(cudaMalloc(ptr, size));
-}
-
-inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {
-    return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height));
-}
-
-inline static hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr,size_t* pitch,size_t widthInBytes,size_t height,unsigned int elementSizeBytes){
-    return hipCUResultTohipError(cuMemAllocPitch(dptr,pitch,widthInBytes,height,elementSizeBytes));
-}
-
-inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) {
-    return hipCUDAErrorTohipError(cudaMalloc3D(pitchedDevPtr, extent));
-}
-
-inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); }
-
-inline static hipError_t hipMallocHost(void** ptr, size_t size)
-    __attribute__((deprecated("use hipHostMalloc instead")));
-inline static hipError_t hipMallocHost(void** ptr, size_t size) {
-    return hipCUDAErrorTohipError(cudaMallocHost(ptr, size));
-}
-
-inline static hipError_t hipMemAllocHost(void** ptr, size_t size)
-    __attribute__((deprecated("use hipHostMalloc instead")));
-inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {
-    return hipCUResultTohipError(cuMemAllocHost(ptr, size));
-}
-
-inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags)
-    __attribute__((deprecated("use hipHostMalloc instead")));
-inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
-}
-
-inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
-}
-
-inline static hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice,
-                                      int device) {
-    return hipCUDAErrorTohipError(cudaMemAdvise(dev_ptr, count,
-        hipMemoryAdviseTocudaMemoryAdvise(advice), device));
-}
-
-inline static hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
-                                             hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaMemPrefetchAsync(dev_ptr, count, device, stream));
-}
-
-inline static hipError_t hipMemRangeGetAttribute(void* data, size_t data_size,
-                                                 hipMemRangeAttribute attribute,
-                                                 const void* dev_ptr, size_t count) {
-    return hipCUDAErrorTohipError(cudaMemRangeGetAttribute(data, data_size,
-        hipMemRangeAttributeToCudaMemRangeAttribute(attribute), dev_ptr, count));
-}
-
-inline static hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
-                                                  hipMemRangeAttribute* attributes,
-                                                  size_t num_attributes, const void* dev_ptr,
-                                                  size_t count) {
-    return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, attributes,
-        num_attributes, dev_ptr, count));
-}
-
-inline static hipError_t hipStreamAttachMemAsync(hipStream_t stream, hipDeviceptr_t* dev_ptr,
-                                                 size_t length __dparm(0),
-                                                 unsigned int flags __dparm(hipMemAttachSingle)) {
-    return hipCUDAErrorTohipError(cudaStreamAttachMemAsync(stream, dev_ptr, length, flags));
-}
-
-inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags));
-}
-
-inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
-                                        size_t width, size_t height,
-                                        unsigned int flags __dparm(hipArrayDefault)) {
-    return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags));
-}
-
-inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc,
-                             hipExtent extent, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags));
-}
-
-inline static hipError_t hipFreeArray(hipArray* array) {
-    return hipCUDAErrorTohipError(cudaFreeArray(array));
-}
-
-inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));
-}
-
-inline static hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {
-    return hipCUDAErrorTohipError(cudaHostGetFlags(flagsPtr, hostPtr));
-}
-
-inline static hipError_t hipHostRegister(void* ptr, size_t size, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaHostRegister(ptr, size, flags));
-}
-
-inline static hipError_t hipHostUnregister(void* ptr) {
-    return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
-}
-
-inline static hipError_t hipFreeHost(void* ptr)
-    __attribute__((deprecated("use hipHostFree instead")));
-inline static hipError_t hipFreeHost(void* ptr) {
-    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
-}
-
-inline static hipError_t hipHostFree(void* ptr) {
-    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
-}
-
-inline static hipError_t hipSetDevice(int device) {
-    return hipCUDAErrorTohipError(cudaSetDevice(device));
-}
-
-inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) {
-    struct cudaDeviceProp cdprop;
-    memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp));
-    cdprop.major = prop->major;
-    cdprop.minor = prop->minor;
-    cdprop.totalGlobalMem = prop->totalGlobalMem;
-    cdprop.sharedMemPerBlock = prop->sharedMemPerBlock;
-    cdprop.regsPerBlock = prop->regsPerBlock;
-    cdprop.warpSize = prop->warpSize;
-    cdprop.maxThreadsPerBlock = prop->maxThreadsPerBlock;
-    cdprop.clockRate = prop->clockRate;
-    cdprop.totalConstMem = prop->totalConstMem;
-    cdprop.multiProcessorCount = prop->multiProcessorCount;
-    cdprop.l2CacheSize = prop->l2CacheSize;
-    cdprop.maxThreadsPerMultiProcessor = prop->maxThreadsPerMultiProcessor;
-    cdprop.computeMode = prop->computeMode;
-    cdprop.canMapHostMemory = prop->canMapHostMemory;
-    cdprop.memoryClockRate = prop->memoryClockRate;
-    cdprop.memoryBusWidth = prop->memoryBusWidth;
-    return hipCUDAErrorTohipError(cudaChooseDevice(device, &cdprop));
-}
-
-inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) {
-    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
-}
-
-inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) {
-    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
-}
-
-inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size) {
-    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
-}
-
-inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size,
-                                            hipStream_t stream) {
-    return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream));
-}
-
-inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size,
-                                            hipStream_t stream) {
-    return hipCUResultTohipError(cuMemcpyDtoHAsync(dst, src, size, stream));
-}
-
-inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size,
-                                            hipStream_t stream) {
-    return hipCUResultTohipError(cuMemcpyDtoDAsync(dst, src, size, stream));
-}
-
-inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
-                                   hipMemcpyKind copyKind) {
-    return hipCUDAErrorTohipError(
-        cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
-}
-
-
-inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
-				      size_t sizeBytes, hipMemcpyKind copyKind,
-				      hipStream_t stream) {
-	cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, 
-										hipMemcpyKindToCudaMemcpyKind(copyKind),
-										stream);
-	
-	if (error != cudaSuccess) return hipCUDAErrorTohipError(error);
-	
-	return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
-}
-
-inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
-                                        hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(
-        cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream));
-}
-
-inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
-                                           size_t offset __dparm(0),
-                                           hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) {
-    return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset,
-                                                     hipMemcpyKindToCudaMemcpyKind(copyType)));
-}
-
-inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
-                                                size_t sizeBytes, size_t offset,
-                                                hipMemcpyKind copyType,
-                                                hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(
-        symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
-}
-
-inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
-                                             size_t offset __dparm(0),
-                                             hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
-    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset,
-                                                       hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
-                                                  size_t sizeBytes, size_t offset,
-                                                  hipMemcpyKind kind,
-                                                  hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(
-        dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
-}
-
-inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
-    return hipCUDAErrorTohipError(cudaGetSymbolAddress(devPtr, symbolName));
-}
-
-inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {
-    return hipCUDAErrorTohipError(cudaGetSymbolSize(size, symbolName));
-}
-
-inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
-                                     size_t width, size_t height, hipMemcpyKind kind) {
-    return hipCUDAErrorTohipError(
-        cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
-  return hipCUResultTohipError(cuMemcpy2D(pCopy));
-}
-
-inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {
-  return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));
-}
-
-inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) {
-    return hipCUDAErrorTohipError(cudaMemcpy3D(p));
-}
-
-inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));
-}
-
-inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {
-    return hipCUResultTohipError(cuMemcpy3D(pCopy));
-}
-
-inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {
-    return hipCUResultTohipError(cuMemcpy3DAsync(pCopy, stream));
-}
-
-inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
-                                          size_t width, size_t height, hipMemcpyKind kind,
-                                          hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height,
-                                                    hipMemcpyKindToCudaMemcpyKind(kind), stream));
-}
-
-inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray* src,
-                                              size_t wOffset, size_t hOffset, size_t width,
-                                              size_t height, hipMemcpyKind kind) {
-    return hipCUDAErrorTohipError(cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width,
-                                                        height,
-                                                        hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray* src,
-                                                   size_t wOffset, size_t hOffset, size_t width,
-                                                   size_t height, hipMemcpyKind kind,
-                                                   hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset,
-                                                             width, height,
-                                                             hipMemcpyKindToCudaMemcpyKind(kind),
-                                                             stream));
-}
-
-inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
-                                            const void* src, size_t spitch, size_t width,
-                                            size_t height, hipMemcpyKind kind) {
-    return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width,
-                                                      height, hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset,
-                                                 const void* src, size_t spitch, size_t width,
-                                                 size_t height, hipMemcpyKind kind,
-                                                 hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch,
-                                                           width, height,
-                                                           hipMemcpyKindToCudaMemcpyKind(kind),
-                                                           stream));
-}
-
-__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
-                                                           size_t hOffset, const void* src,
-                                                           size_t count, hipMemcpyKind kind) {
-    return hipCUDAErrorTohipError(
-        cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
-                                                             size_t wOffset, size_t hOffset,
-                                                             size_t count, hipMemcpyKind kind) {
-    return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count,
-                                                      hipMemcpyKindToCudaMemcpyKind(kind)));
-}
-
-inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
-                                       size_t count) {
-    return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
-}
-
-inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost,
-                                       size_t count) {
-    return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
-}
-
-inline static hipError_t hipDeviceSynchronize() {
-    return hipCUDAErrorTohipError(cudaDeviceSynchronize());
-}
-
-inline static hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* pCacheConfig) {
-    return hipCUDAErrorTohipError(cudaDeviceGetCacheConfig(pCacheConfig));
-}
-
-inline static hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) {
-    return hipCUDAErrorTohipError(cudaFuncSetAttribute(func, attr, value));
-}
-
-inline static hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) {
-    return hipCUDAErrorTohipError(cudaDeviceSetCacheConfig(cacheConfig));
-}
-
-inline static hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {
-    return hipCUDAErrorTohipError(cudaFuncSetSharedMemConfig(func, config));
-}
-
-inline static const char* hipGetErrorString(hipError_t error) {
-    return cudaGetErrorString(hipErrorToCudaError(error));
-}
-
-inline static const char* hipGetErrorName(hipError_t error) {
-    return cudaGetErrorName(hipErrorToCudaError(error));
-}
-
-inline static hipError_t hipGetDeviceCount(int* count) {
-    return hipCUDAErrorTohipError(cudaGetDeviceCount(count));
-}
-
-inline static hipError_t hipGetDevice(int* device) {
-    return hipCUDAErrorTohipError(cudaGetDevice(device));
-}
-
-inline static hipError_t hipIpcCloseMemHandle(void* devPtr) {
-    return hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr));
-}
-
-inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {
-    return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event));
-}
-
-inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) {
-    return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr));
-}
-
-inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {
-    return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle));
-}
-
-inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle,
-                                             unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags));
-}
-
-inline static hipError_t hipMemset(void* devPtr, int value, size_t count) {
-    return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count));
-}
-
-inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) {
-    return hipCUResultTohipError(cuMemsetD32(devPtr, value, count));
-}
-
-inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count,
-                                        hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream));
-}
-
-inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count,
-                                           hipStream_t stream __dparm(0)) {
-    return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream));
-}
-
-inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) {
-    return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes));
-}
-
-inline static hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes,
-                                          hipStream_t stream __dparm(0)) {
-    return hipCUResultTohipError(cuMemsetD8Async(dest, value, sizeBytes, stream));
-}
-
-inline static hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes) {
-    return hipCUResultTohipError(cuMemsetD16(dest, value, sizeBytes));
-}
-
-inline static hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes,
-                                           hipStream_t stream __dparm(0)) {
-    return hipCUResultTohipError(cuMemsetD16Async(dest, value, sizeBytes, stream));
-}
-
-inline static hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) {
-    return hipCUDAErrorTohipError(cudaMemset2D(dst, pitch, value, width, height));
-}
-
-inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream));
-}
-
-inline static hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent ){
-    return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent));
-}
-
-inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent, hipStream_t stream __dparm(0) ){
-    return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream));
-}
-
-inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {
-    struct cudaDeviceProp cdprop;
-    cudaError_t cerror;
-    cerror = cudaGetDeviceProperties(&cdprop, device);
-
-    strncpy(p_prop->name, cdprop.name, 256);
-    p_prop->totalGlobalMem = cdprop.totalGlobalMem;
-    p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock;
-    p_prop->regsPerBlock = cdprop.regsPerBlock;
-    p_prop->warpSize = cdprop.warpSize;
-    p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock;
-    for (int i = 0; i < 3; i++) {
-        p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i];
-        p_prop->maxGridSize[i] = cdprop.maxGridSize[i];
-    }
-    p_prop->clockRate = cdprop.clockRate;
-    p_prop->memoryClockRate = cdprop.memoryClockRate;
-    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
-    p_prop->totalConstMem = cdprop.totalConstMem;
-    p_prop->major = cdprop.major;
-    p_prop->minor = cdprop.minor;
-    p_prop->multiProcessorCount = cdprop.multiProcessorCount;
-    p_prop->l2CacheSize = cdprop.l2CacheSize;
-    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
-    p_prop->computeMode = cdprop.computeMode;
-    p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate:
-
-    int ccVers = p_prop->major * 100 + p_prop->minor * 10;
-    p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110);
-    p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110);
-    p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120);
-    p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120);
-    p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200);
-    p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120);
-    p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110);
-    p_prop->arch.hasDoubles = (ccVers >= 130);
-    p_prop->arch.hasWarpVote = (ccVers >= 120);
-    p_prop->arch.hasWarpBallot = (ccVers >= 200);
-    p_prop->arch.hasWarpShuffle = (ccVers >= 300);
-    p_prop->arch.hasFunnelShift = (ccVers >= 350);
-    p_prop->arch.hasThreadFenceSystem = (ccVers >= 200);
-    p_prop->arch.hasSyncThreadsExt = (ccVers >= 200);
-    p_prop->arch.hasSurfaceFuncs = (ccVers >= 200);
-    p_prop->arch.has3dGrid = (ccVers >= 200);
-    p_prop->arch.hasDynamicParallelism = (ccVers >= 350);
-
-    p_prop->concurrentKernels = cdprop.concurrentKernels;
-    p_prop->pciDomainID = cdprop.pciDomainID;
-    p_prop->pciBusID = cdprop.pciBusID;
-    p_prop->pciDeviceID = cdprop.pciDeviceID;
-    p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
-    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
-    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
-    p_prop->gcnArch = 0; // Not a GCN arch
-    p_prop->integrated = cdprop.integrated;
-    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
-    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
-    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
-
-    p_prop->maxTexture1D    = cdprop.maxTexture1D;
-    p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
-    p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
-    p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
-    p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
-    p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
-
-    p_prop->memPitch                 = cdprop.memPitch;
-    p_prop->textureAlignment         = cdprop.textureAlignment;
-    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
-    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
-    p_prop->ECCEnabled               = cdprop.ECCEnabled;
-    p_prop->tccDriver                = cdprop.tccDriver;
-
-    return hipCUDAErrorTohipError(cerror);
-}
-
-inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {
-    enum cudaDeviceAttr cdattr;
-    cudaError_t cerror;
-
-    switch (attr) {
-        case hipDeviceAttributeMaxThreadsPerBlock:
-            cdattr = cudaDevAttrMaxThreadsPerBlock;
-            break;
-        case hipDeviceAttributeMaxBlockDimX:
-            cdattr = cudaDevAttrMaxBlockDimX;
-            break;
-        case hipDeviceAttributeMaxBlockDimY:
-            cdattr = cudaDevAttrMaxBlockDimY;
-            break;
-        case hipDeviceAttributeMaxBlockDimZ:
-            cdattr = cudaDevAttrMaxBlockDimZ;
-            break;
-        case hipDeviceAttributeMaxGridDimX:
-            cdattr = cudaDevAttrMaxGridDimX;
-            break;
-        case hipDeviceAttributeMaxGridDimY:
-            cdattr = cudaDevAttrMaxGridDimY;
-            break;
-        case hipDeviceAttributeMaxGridDimZ:
-            cdattr = cudaDevAttrMaxGridDimZ;
-            break;
-        case hipDeviceAttributeMaxSharedMemoryPerBlock:
-            cdattr = cudaDevAttrMaxSharedMemoryPerBlock;
-            break;
-        case hipDeviceAttributeTotalConstantMemory:
-            cdattr = cudaDevAttrTotalConstantMemory;
-            break;
-        case hipDeviceAttributeWarpSize:
-            cdattr = cudaDevAttrWarpSize;
-            break;
-        case hipDeviceAttributeMaxRegistersPerBlock:
-            cdattr = cudaDevAttrMaxRegistersPerBlock;
-            break;
-        case hipDeviceAttributeClockRate:
-            cdattr = cudaDevAttrClockRate;
-            break;
-        case hipDeviceAttributeMemoryClockRate:
-            cdattr = cudaDevAttrMemoryClockRate;
-            break;
-        case hipDeviceAttributeMemoryBusWidth:
-            cdattr = cudaDevAttrGlobalMemoryBusWidth;
-            break;
-        case hipDeviceAttributeMultiprocessorCount:
-            cdattr = cudaDevAttrMultiProcessorCount;
-            break;
-        case hipDeviceAttributeComputeMode:
-            cdattr = cudaDevAttrComputeMode;
-            break;
-        case hipDeviceAttributeL2CacheSize:
-            cdattr = cudaDevAttrL2CacheSize;
-            break;
-        case hipDeviceAttributeMaxThreadsPerMultiProcessor:
-            cdattr = cudaDevAttrMaxThreadsPerMultiProcessor;
-            break;
-        case hipDeviceAttributeComputeCapabilityMajor:
-            cdattr = cudaDevAttrComputeCapabilityMajor;
-            break;
-        case hipDeviceAttributeComputeCapabilityMinor:
-            cdattr = cudaDevAttrComputeCapabilityMinor;
-            break;
-        case hipDeviceAttributeConcurrentKernels:
-            cdattr = cudaDevAttrConcurrentKernels;
-            break;
-        case hipDeviceAttributePciBusId:
-            cdattr = cudaDevAttrPciBusId;
-            break;
-        case hipDeviceAttributePciDeviceId:
-            cdattr = cudaDevAttrPciDeviceId;
-            break;
-        case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
-            cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor;
-            break;
-        case hipDeviceAttributeIsMultiGpuBoard:
-            cdattr = cudaDevAttrIsMultiGpuBoard;
-            break;
-        case hipDeviceAttributeIntegrated:
-            cdattr = cudaDevAttrIntegrated;
-            break;
-        case hipDeviceAttributeMaxTexture1DWidth:
-            cdattr = cudaDevAttrMaxTexture1DWidth;
-            break;
-        case hipDeviceAttributeMaxTexture2DWidth:
-            cdattr = cudaDevAttrMaxTexture2DWidth;
-            break;
-        case hipDeviceAttributeMaxTexture2DHeight:
-            cdattr = cudaDevAttrMaxTexture2DHeight;
-            break;
-        case hipDeviceAttributeMaxTexture3DWidth:
-            cdattr = cudaDevAttrMaxTexture3DWidth;
-            break;
-        case hipDeviceAttributeMaxTexture3DHeight:
-            cdattr = cudaDevAttrMaxTexture3DHeight;
-            break;
-        case hipDeviceAttributeMaxTexture3DDepth:
-            cdattr = cudaDevAttrMaxTexture3DDepth;
-            break;
-        case hipDeviceAttributeMaxPitch:
-            cdattr = cudaDevAttrMaxPitch;
-            break;
-        case hipDeviceAttributeTextureAlignment:
-            cdattr = cudaDevAttrTextureAlignment;
-            break;
-        case hipDeviceAttributeTexturePitchAlignment:
-            cdattr = cudaDevAttrTexturePitchAlignment;
-            break;
-        case hipDeviceAttributeKernelExecTimeout:
-            cdattr = cudaDevAttrKernelExecTimeout;
-            break;
-        case hipDeviceAttributeCanMapHostMemory:
-            cdattr = cudaDevAttrCanMapHostMemory;
-            break;
-        case hipDeviceAttributeEccEnabled:
-            cdattr = cudaDevAttrEccEnabled;
-            break;
-        case hipDeviceAttributeCooperativeLaunch:
-            cdattr = cudaDevAttrCooperativeLaunch;
-            break;
-        case hipDeviceAttributeCooperativeMultiDeviceLaunch:
-            cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
-            break;
-        case hipDeviceAttributeConcurrentManagedAccess:
-            cdattr = cudaDevAttrConcurrentManagedAccess;
-            break;
-        case hipDeviceAttributeManagedMemory:
-            cdattr = cudaDevAttrManagedMemory;
-            break;
-        case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
-            cdattr = cudaDevAttrPageableMemoryAccessUsesHostPageTables;
-            break;
-        case hipDeviceAttributePageableMemoryAccess:
-            cdattr = cudaDevAttrPageableMemoryAccess;
-            break;
-        case hipDeviceAttributeDirectManagedMemAccessFromHost:
-            cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
-            break;
-        default:
-            return hipCUDAErrorTohipError(cudaErrorInvalidValue);
-    }
-
-    cerror = cudaDeviceGetAttribute(pi, cdattr, device);
-
-    return hipCUDAErrorTohipError(cerror);
-}
-
-inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
-                                                                      const void* func,
-                                                                      int blockSize,
-                                                                      size_t dynamicSMemSize) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
-                                                              blockSize, dynamicSMemSize));
-}
-
-inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
-                                                                      const void* func,
-                                                                      int blockSize,
-                                                                      size_t dynamicSMemSize,
-                                                                      unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
-                                                      blockSize, dynamicSMemSize, flags));
-}
-
-inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, 
-                                                                 hipFunction_t f,
-                                                                 int  blockSize,
-                                                                 size_t dynamicSMemSize ){
-    return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f,
-                                                                   blockSize, dynamicSMemSize));
-}
-
-inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks,
-                                                                          hipFunction_t f,
-                                                                          int  blockSize,
-                                                                          size_t dynamicSMemSize,
-                                                                          unsigned int  flags ) {
-    return hipCUResultTohipError(cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks,f,
-                                                                blockSize, dynamicSMemSize, flags));
-}
-
-//TODO - Match CUoccupancyB2DSize
-inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             int blockSizeLimit){
-    return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL,
-                                 dynSharedMemPerBlk, blockSizeLimit));
-}
-
-//TODO - Match CUoccupancyB2DSize
-inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(int* gridSize, int* blockSize,
-                                             hipFunction_t f, size_t dynSharedMemPerBlk,
-                                             int blockSizeLimit, unsigned int  flags){
-    return hipCUResultTohipError(cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL,
-                                 dynSharedMemPerBlk, blockSizeLimit, flags));
-}
-
-inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) {
-    struct cudaPointerAttributes cPA;
-    hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr));
-    if (err == hipSuccess) {
-#if (CUDART_VERSION >= 11000)
-        auto memType = cPA.type;
-#else
-        unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11
-#endif
-        switch (memType) {
-            case cudaMemoryTypeDevice:
-                attributes->memoryType = hipMemoryTypeDevice;
-                break;
-            case cudaMemoryTypeHost:
-                attributes->memoryType = hipMemoryTypeHost;
-                break;
-            default:
-                return hipErrorUnknown;
-        }
-        attributes->device = cPA.device;
-        attributes->devicePointer = cPA.devicePointer;
-        attributes->hostPointer = cPA.hostPointer;
-        attributes->isManaged = 0;
-        attributes->allocationFlags = 0;
-    }
-    return err;
-}
-
-inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {
-    return hipCUDAErrorTohipError(cudaMemGetInfo(free, total));
-}
-
-inline static hipError_t hipEventCreate(hipEvent_t* event) {
-    return hipCUDAErrorTohipError(cudaEventCreate(event));
-}
-
-inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __dparm(NULL)) {
-    return hipCUDAErrorTohipError(cudaEventRecord(event, stream));
-}
-
-inline static hipError_t hipEventSynchronize(hipEvent_t event) {
-    return hipCUDAErrorTohipError(cudaEventSynchronize(event));
-}
-
-inline static hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {
-    return hipCUDAErrorTohipError(cudaEventElapsedTime(ms, start, stop));
-}
-
-inline static hipError_t hipEventDestroy(hipEvent_t event) {
-    return hipCUDAErrorTohipError(cudaEventDestroy(event));
-}
-
-inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags));
-}
-
-inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {
-    return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority));
-}
-
-inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {
-    return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority));
-}
-
-inline static hipError_t hipStreamCreate(hipStream_t* stream) {
-    return hipCUDAErrorTohipError(cudaStreamCreate(stream));
-}
-
-inline static hipError_t hipStreamSynchronize(hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
-}
-
-inline static hipError_t hipStreamDestroy(hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaStreamDestroy(stream));
-}
-
-inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) {
-    return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags));
-}
-
-inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) {
-    return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority));
-}
-
-inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event,
-                                            unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaStreamWaitEvent(stream, event, flags));
-}
-
-inline static hipError_t hipStreamQuery(hipStream_t stream) {
-    return hipCUDAErrorTohipError(cudaStreamQuery(stream));
-}
-
-inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback,
-                                              void* userData, unsigned int flags) {
-    return hipCUDAErrorTohipError(
-        cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
-}
-
-inline static hipError_t hipDriverGetVersion(int* driverVersion) {
-    cudaError_t err = cudaDriverGetVersion(driverVersion);
-
-    // Override driver version to match version reported on HCC side.
-    *driverVersion = 4;
-
-    return hipCUDAErrorTohipError(err);
-}
-
-inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) {
-    return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion));
-}
-
-inline static hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) {
-    return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
-}
-
-inline static hipError_t hipDeviceDisablePeerAccess(int peerDevice) {
-    return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice));
-}
-
-inline static hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags));
-}
-
-inline static hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) {
-    return hipCUResultTohipError(cuCtxDisablePeerAccess(peerCtx));
-}
-
-inline static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) {
-    return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags));
-}
-
-inline static hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags,
-                                                     int* active) {
-    return hipCUResultTohipError(cuDevicePrimaryCtxGetState(dev, flags, active));
-}
-
-inline static hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {
-    return hipCUResultTohipError(cuDevicePrimaryCtxRelease(dev));
-}
-
-inline static hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {
-    return hipCUResultTohipError(cuDevicePrimaryCtxRetain(pctx, dev));
-}
-
-inline static hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {
-    return hipCUResultTohipError(cuDevicePrimaryCtxReset(dev));
-}
-
-inline static hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {
-    return hipCUResultTohipError(cuDevicePrimaryCtxSetFlags(dev, flags));
-}
-
-inline static hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize,
-                                               hipDeviceptr_t dptr) {
-    return hipCUResultTohipError(cuMemGetAddressRange(pbase, psize, dptr));
-}
-
-inline static hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice,
-                                       size_t count) {
-    return hipCUDAErrorTohipError(cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count));
-}
-
-inline static hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src,
-                                            int srcDevice, size_t count,
-                                            hipStream_t stream __dparm(0)) {
-    return hipCUDAErrorTohipError(
-        cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream));
-}
-
-// Profile APIs:
-inline static hipError_t hipProfilerStart() { return hipCUDAErrorTohipError(cudaProfilerStart()); }
-
-inline static hipError_t hipProfilerStop() { return hipCUDAErrorTohipError(cudaProfilerStop()); }
-
-inline static hipError_t hipGetDeviceFlags(unsigned int* flags) {
-    return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags));
-}
-
-inline static hipError_t hipSetDeviceFlags(unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags));
-}
-
-inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaEventCreateWithFlags(event, flags));
-}
-
-inline static hipError_t hipEventQuery(hipEvent_t event) {
-    return hipCUDAErrorTohipError(cudaEventQuery(event));
-}
-
-inline static hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) {
-    return hipCUResultTohipError(cuCtxCreate(ctx, flags, device));
-}
-
-inline static hipError_t hipCtxDestroy(hipCtx_t ctx) {
-    return hipCUResultTohipError(cuCtxDestroy(ctx));
-}
-
-inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {
-    return hipCUResultTohipError(cuCtxPopCurrent(ctx));
-}
-
-inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) {
-    return hipCUResultTohipError(cuCtxPushCurrent(ctx));
-}
-
-inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) {
-    return hipCUResultTohipError(cuCtxSetCurrent(ctx));
-}
-
-inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {
-    return hipCUResultTohipError(cuCtxGetCurrent(ctx));
-}
-
-inline static hipError_t hipCtxGetDevice(hipDevice_t* device) {
-    return hipCUResultTohipError(cuCtxGetDevice(device));
-}
-
-inline static hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {
-    return hipCUResultTohipError(cuCtxGetApiVersion(ctx, (unsigned int*)apiVersion));
-}
-
-inline static hipError_t hipCtxGetCacheConfig(hipFuncCache* cacheConfig) {
-    return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig));
-}
-
-inline static hipError_t hipCtxSetCacheConfig(hipFuncCache cacheConfig) {
-    return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig));
-}
-
-inline static hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {
-    return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config));
-}
-
-inline static hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {
-    return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig*)pConfig));
-}
-
-inline static hipError_t hipCtxSynchronize(void) {
-    return hipCUResultTohipError(cuCtxSynchronize());
-}
-
-inline static hipError_t hipCtxGetFlags(unsigned int* flags) {
-    return hipCUResultTohipError(cuCtxGetFlags(flags));
-}
-
-inline static hipError_t hipCtxDetach(hipCtx_t ctx) {
-    return hipCUResultTohipError(cuCtxDetach(ctx));
-}
-
-inline static hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) {
-    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
-}
-
-inline static hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) {
-    return hipCUResultTohipError(cuDeviceComputeCapability(major, minor, device));
-}
-
-inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) {
-    return hipCUResultTohipError(cuDeviceGetName(name, len, device));
-}
-
-inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
-                                                  int srcDevice, int dstDevice) {
-    return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice));
-}
-
-inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) {
-    return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device));
-}
-
-inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) {
-    return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId));
-}
-
-inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* config) {
-    return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config));
-}
-
-inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {
-    return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config));
-}
-
-inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
-    return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
-}
-
-inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {
-    return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
-}
-
-inline static hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {
-    return hipCUResultTohipError(cuModuleLoad(module, fname));
-}
-
-inline static hipError_t hipModuleUnload(hipModule_t hmod) {
-    return hipCUResultTohipError(cuModuleUnload(hmod));
-}
-
-inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module,
-                                              const char* kname) {
-    return hipCUResultTohipError(cuModuleGetFunction(function, module, kname));
-}
-
-inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name){
-    hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));
-}
-
-inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {
-    return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
-}
-
-inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {
-    return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
-}
-
-inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod,
-                                            const char* name) {
-    return hipCUResultTohipError(cuModuleGetGlobal(dptr, bytes, hmod, name));
-}
-
-inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {
-    return hipCUResultTohipError(cuModuleLoadData(module, image));
-}
-
-inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
-                                             unsigned int numOptions, hipJitOption* options,
-                                             void** optionValues) {
-    return hipCUResultTohipError(
-        cuModuleLoadDataEx(module, image, numOptions, options, optionValues));
-}
-
-inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks,
-					 dim3 dimBlocks, void** args, size_t sharedMemBytes,
-					 hipStream_t stream)
-{
-   return hipCUDAErrorTohipError(cudaLaunchKernel(function_address,numBlocks,dimBlocks,args,sharedMemBytes,stream));
-}
-
-inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
-                                               unsigned int gridDimY, unsigned int gridDimZ,
-                                               unsigned int blockDimX, unsigned int blockDimY,
-                                               unsigned int blockDimZ, unsigned int sharedMemBytes,
-                                               hipStream_t stream, void** kernelParams,
-                                               void** extra) {
-    return hipCUResultTohipError(cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX,
-                                                blockDimY, blockDimZ, sharedMemBytes, stream,
-                                                kernelParams, extra));
-}
-
-inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {
-    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
-}
-
-__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
-                                                         struct textureReference* tex,
-                                                         const void* devPtr,
-                                                         const hipChannelFormatDesc* desc,
-                                                         size_t size __dparm(UINT_MAX)) {
-    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
-}
-
-__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
-    size_t* offset, struct textureReference* tex, const void* devPtr,
-    const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
-    return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
-}
-
-inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
-                                                        hipChannelFormatKind f) {
-    return cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));
-}
-
-inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
-                                                const hipResourceDesc* pResDesc,
-                                                const hipTextureDesc* pTexDesc,
-                                                const hipResourceViewDesc* pResViewDesc) {
-    return hipCUDAErrorTohipError(
-        cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc));
-}
-
-inline static hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) {
-    return hipCUDAErrorTohipError(cudaDestroyTextureObject(textureObject));
-}
-
-inline static hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject,
-                                                const hipResourceDesc* pResDesc) {
-    return hipCUDAErrorTohipError(cudaCreateSurfaceObject(pSurfObject, pResDesc));
-}
-
-inline static hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {
-    return hipCUDAErrorTohipError(cudaDestroySurfaceObject(surfaceObject));
-}
-
-inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc,
-                                           hipTextureObject_t textureObject) {
-    return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
-}
-
-__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
-    size_t* offset, const struct textureReference* texref) {
-    return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
-}
-
-inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
-{
-    return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));
-}
-
-inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim,
-                                      void** kernelParams, unsigned int sharedMemBytes,
-                                      hipStream_t stream) {
-    return hipCUDAErrorTohipError(
-            cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
-}
-
-inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
-                                                 int  numDevices, unsigned int  flags) {
-    return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#ifdef __CUDACC__
-
-template<class T>
-inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
-                                                                      T func,
-                                                                      int blockSize,
-                                                                      size_t dynamicSMemSize) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func,
-                                                            blockSize, dynamicSMemSize));
-}
-
-template <class T>
-inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
-                                                           size_t dynamicSMemSize = 0,
-                                                           int blockSizeLimit = 0) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func,
-                                                           dynamicSMemSize, blockSizeLimit));
-}
-
-template <class T>
-inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func,
-                                                           size_t dynamicSMemSize = 0,
-                                                           int blockSizeLimit = 0, unsigned int  flags = 0) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func,
-                                                           dynamicSMemSize, blockSizeLimit, flags));
-}
-
-template <class T>
-inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( int* numBlocks, T func,
-                                              int  blockSize, size_t dynamicSMemSize,unsigned int flags) {
-    return hipCUDAErrorTohipError(cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func,
-                                                                 blockSize, dynamicSMemSize, flags));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
-                                        const void* devPtr, size_t size = UINT_MAX) {
-    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, size));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
-                                        const void* devPtr, const hipChannelFormatDesc& desc,
-                                        size_t size = UINT_MAX) {
-    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
-    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
-    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
-    struct texture<T, dim, readMode>& tex, hipArray_const_t array,
-    const hipChannelFormatDesc& desc) {
-    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
-    struct texture<T, dim, readMode>* tex, hipArray_const_t array,
-    const hipChannelFormatDesc* desc) {
-    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
-}
-
-template <class T, int dim, enum cudaTextureReadMode readMode>
-__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
-    struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
-    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
-}
-
-template <class T>
-inline static hipChannelFormatDesc hipCreateChannelDesc() {
-    return cudaCreateChannelDesc<T>();
-}
-
-template <class T>
-inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
-                                             void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
-    return hipCUDAErrorTohipError(
-            cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
-}
-
-inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
-    return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
-}
-
-inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){
-    return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
-}
-
-inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){
-   return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
-}
-
-inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, hipDeviceptr_t dptr, size_t Pitch){
-   return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
-}
-
-inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
-   return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
-}
-
-inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
-   return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
-}
-
-inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
-   return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
-}
-
-inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
-   return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
-}
-
-inline static hipError_t hipArrayDestroy(hiparray hArray){
-   return hipCUResultTohipError(cuArrayDestroy(hArray));
-}
-
-inline static hipError_t hipArray3DCreate(hiparray* pHandle,
-                                          const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray){
-   return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
-}
-
-#endif  //__CUDACC__
-
-#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
diff --git a/hipnv/include/hip/nvidia_detail/hip_texture_types.h b/hipnv/include/hip/nvidia_detail/hip_texture_types.h
deleted file mode 100644
index df374d705a..0000000000
--- a/hipnv/include/hip/nvidia_detail/hip_texture_types.h
+++ /dev/null
@@ -1,6 +0,0 @@
-#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
-#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
-
-#include <texture_types.h>
-
-#endif
diff --git a/hipnv/include/hip/nvidia_detail/hiprtc.h b/hipnv/include/hip/nvidia_detail/hiprtc.h
deleted file mode 100644
index 449ba26c0f..0000000000
--- a/hipnv/include/hip/nvidia_detail/hiprtc.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
-Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
-*/
-#ifndef HIPRTC_H
-#define HIPRTC_H
-
-#include <cuda.h>
-#include <nvrtc.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif /* __cplusplus */
-
-#include <stdlib.h>
-
-#if !defined(_WIN32)
-#pragma GCC visibility push(default)
-#endif
-
-typedef enum hiprtcResult {
-  HIPRTC_SUCCESS = 0,
-  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
-  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
-  HIPRTC_ERROR_INVALID_INPUT = 3,
-  HIPRTC_ERROR_INVALID_PROGRAM = 4,
-  HIPRTC_ERROR_INVALID_OPTION = 5,
-  HIPRTC_ERROR_COMPILATION = 6,
-  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
-  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
-  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
-  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
-  HIPRTC_ERROR_INTERNAL_ERROR = 11
-} hiprtcResult;
-
-inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
-  switch (result) {
-    case HIPRTC_SUCCESS:
-      return NVRTC_SUCCESS;
-    case HIPRTC_ERROR_OUT_OF_MEMORY:
-      return NVRTC_ERROR_OUT_OF_MEMORY;
-    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
-      return NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
-    case HIPRTC_ERROR_INVALID_INPUT:
-      return NVRTC_ERROR_INVALID_INPUT;
-    case HIPRTC_ERROR_INVALID_PROGRAM:
-      return NVRTC_ERROR_INVALID_PROGRAM;
-    case HIPRTC_ERROR_INVALID_OPTION:
-      return NVRTC_ERROR_INVALID_OPTION;
-    case HIPRTC_ERROR_COMPILATION:
-      return NVRTC_ERROR_COMPILATION;
-    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
-      return NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
-    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
-      return NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
-    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
-      return NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
-    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
-      return NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
-    case HIPRTC_ERROR_INTERNAL_ERROR:
-      return NVRTC_ERROR_INTERNAL_ERROR;
-  }
-}
-
-inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
-  switch (result) {
-    case NVRTC_SUCCESS:
-      return HIPRTC_SUCCESS;
-    case NVRTC_ERROR_OUT_OF_MEMORY:
-      return HIPRTC_ERROR_OUT_OF_MEMORY;
-    case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
-      return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
-    case NVRTC_ERROR_INVALID_INPUT:
-      return HIPRTC_ERROR_INVALID_INPUT;
-    case NVRTC_ERROR_INVALID_PROGRAM:
-      return HIPRTC_ERROR_INVALID_PROGRAM;
-    case NVRTC_ERROR_INVALID_OPTION:
-      return HIPRTC_ERROR_INVALID_OPTION;
-    case NVRTC_ERROR_COMPILATION:
-      return HIPRTC_ERROR_COMPILATION;
-    case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
-      return HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
-    case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
-      return HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
-    case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
-      return HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
-    case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
-      return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
-    case NVRTC_ERROR_INTERNAL_ERROR:
-      return HIPRTC_ERROR_INTERNAL_ERROR;
-  }
-}
-
-const char* hiprtcGetErrorString(hiprtcResult result) {
-  return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));
-}
-
-hiprtcResult hiprtcVersion(int* major, int* minor) {
-  return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));
-}
-
-typedef nvrtcProgram hiprtcProgram;
-
-hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
-  return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));
-}
-
-hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
-  return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));
-}
-
-hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
-                                 int numHeaders, const char** headers, const char** includeNames) {
-  return nvrtcResultTohiprtcResult(
-      nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
-}
-
-hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
-  return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));
-}
-
-hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
-                                  const char** lowered_name) {
-  return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));
-}
-
-hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
-  return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));
-}
-
-hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
-  return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));
-}
-
-hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
-  return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));
-}
-
-hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
-  return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));
-}
-
-#if !defined(_WIN32)
-#pragma GCC visibility pop
-#endif
-
-#ifdef __cplusplus
-}
-#endif /* __cplusplus */
-
-#endif  // HIPRTC_H

From b2f4e62135de6452d3b4817e22458be3678683bd Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Mon, 7 Jun 2021 21:42:44 +0000
Subject: [PATCH 012/177] ROCMOPS-1956 - Push restructured code to hipamd

hipamd will have AMD's ROCCLR based HIP backend implementation

Change-Id: Id7de9634519b4ce46fca71a1b61f3d5b1e3fc459
---
 .../nvidia_detail/nvidia_channel_descriptor.h |   28 +
 .../hip/nvidia_detail/nvidia_hip_complex.h    |  119 +
 .../nvidia_hip_cooperative_groups.h           |   12 +
 .../hip/nvidia_detail/nvidia_hip_runtime.h    |  122 +
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 2195 +++++++++++++++++
 .../nvidia_detail/nvidia_hip_texture_types.h  |    6 +
 .../include/hip/nvidia_detail/nvidia_hiprtc.h |  168 ++
 7 files changed, 2650 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_texture_types.h
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h b/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
new file mode 100644
index 0000000000..7eb0e65fda
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
@@ -0,0 +1,28 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
+
+#include "channel_descriptor.h"
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_CHANNEL_DESCRIPTOR_H
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
new file mode 100644
index 0000000000..10a53d1743
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
@@ -0,0 +1,119 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
+
+#include "cuComplex.h"
+
+typedef cuFloatComplex hipFloatComplex;
+
+__device__ __host__ static inline float hipCrealf(hipFloatComplex z) { return cuCrealf(z); }
+
+__device__ __host__ static inline float hipCimagf(hipFloatComplex z) { return cuCimagf(z); }
+
+__device__ __host__ static inline hipFloatComplex make_hipFloatComplex(float a, float b) {
+    return make_cuFloatComplex(a, b);
+}
+
+__device__ __host__ static inline hipFloatComplex hipConjf(hipFloatComplex z) { return cuConjf(z); }
+
+__device__ __host__ static inline float hipCsqabsf(hipFloatComplex z) {  // squared magnitude |z|^2
+    return cuCrealf(z) * cuCrealf(z) + cuCimagf(z) * cuCimagf(z);  // re^2 + im^2; no sqrt-then-square
+}
+
+__device__ __host__ static inline hipFloatComplex hipCaddf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCaddf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCsubf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCsubf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCmulf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCmulf(p, q);
+}
+
+__device__ __host__ static inline hipFloatComplex hipCdivf(hipFloatComplex p, hipFloatComplex q) {
+    return cuCdivf(p, q);
+}
+
+__device__ __host__ static inline float hipCabsf(hipFloatComplex z) { return cuCabsf(z); }
+
+typedef cuDoubleComplex hipDoubleComplex;
+
+__device__ __host__ static inline double hipCreal(hipDoubleComplex z) { return cuCreal(z); }
+
+__device__ __host__ static inline double hipCimag(hipDoubleComplex z) { return cuCimag(z); }
+
+__device__ __host__ static inline hipDoubleComplex make_hipDoubleComplex(double a, double b) {
+    return make_cuDoubleComplex(a, b);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipConj(hipDoubleComplex z) { return cuConj(z); }
+
+__device__ __host__ static inline double hipCsqabs(hipDoubleComplex z) {  // squared magnitude |z|^2
+    return cuCreal(z) * cuCreal(z) + cuCimag(z) * cuCimag(z);  // re^2 + im^2; no sqrt-then-square
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCadd(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCadd(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCsub(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCsub(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCmul(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCmul(p, q);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCdiv(hipDoubleComplex p, hipDoubleComplex q) {
+    return cuCdiv(p, q);
+}
+
+__device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cuCabs(z); }
+
+typedef cuFloatComplex hipComplex;
+
+__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
+    return make_cuComplex(x, y);
+}
+
+__device__ __host__ static inline hipFloatComplex hipComplexDoubleToFloat(hipDoubleComplex z) {
+    return cuComplexDoubleToFloat(z);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipComplexFloatToDouble(hipFloatComplex z) {
+    return cuComplexFloatToDouble(z);
+}
+
+__device__ __host__ static inline hipComplex hipCfmaf(hipComplex p, hipComplex q, hipComplex r) {
+    return cuCfmaf(p, q, r);
+}
+
+__device__ __host__ static inline hipDoubleComplex hipCfma(hipDoubleComplex p, hipDoubleComplex q,
+                                                           hipDoubleComplex r) {
+    return cuCfma(p, q, r);
+}
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COMPLEX_H
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups.h
new file mode 100644
index 0000000000..fc98ae2281
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_cooperative_groups.h
@@ -0,0 +1,12 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
+
+// Include CUDA headers
+#include <cuda_runtime.h>
+#include <cooperative_groups.h>
+
+// Include HIP wrapper headers around CUDA
+#include <hip/hip_runtime.h>
+#include <hip/hip_runtime_api.h>
+
+#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_COOPERATIVE_GROUPS_H
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
new file mode 100644
index 0000000000..a42fecc611
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -0,0 +1,122 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_H
+
+#include <cuda_runtime.h>
+
+#include <hip/hip_runtime_api.h>
+
+#define HIP_KERNEL_NAME(...) __VA_ARGS__
+
+typedef int hipLaunchParm;
+
+#define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
+    do { /* do/while(0) makes the launch expand to a single statement */                           \
+        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
+    } while (0)
+
+#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)  // wraps kernelName in parentheses, then delegates
+
+#define hipReadModeElementType cudaReadModeElementType
+
+#ifdef __CUDA_ARCH__
+
+
+// 32-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 110)
+#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (__CUDA_ARCH__ >= 200)
+
+// 64-bit Atomics:
+#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SHARED_INT64_ATOMICS__ (__CUDA_ARCH__ >= 120)
+
+// Doubles
+#define __HIP_ARCH_HAS_DOUBLES__ (__CUDA_ARCH__ >= 120)
+
+// warp cross-lane operations:
+#define __HIP_ARCH_HAS_WARP_VOTE__ (__CUDA_ARCH__ >= 120)
+#define __HIP_ARCH_HAS_WARP_BALLOT__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_WARP_SHUFFLE__ (__CUDA_ARCH__ >= 300)
+#define __HIP_ARCH_HAS_WARP_FUNNEL_SHIFT__ (__CUDA_ARCH__ >= 350)
+
+// sync
+#define __HIP_ARCH_HAS_THREAD_FENCE_SYSTEM__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_SYNC_THREAD_EXT__ (__CUDA_ARCH__ >= 200)
+
+// misc
+#define __HIP_ARCH_HAS_SURFACE_FUNCS__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_3DGRID__ (__CUDA_ARCH__ >= 200)
+#define __HIP_ARCH_HAS_DYNAMIC_PARALLEL__ (__CUDA_ARCH__ >= 350)
+
+#endif
+
+#ifdef __CUDACC__
+
+
+#define hipThreadIdx_x threadIdx.x
+#define hipThreadIdx_y threadIdx.y
+#define hipThreadIdx_z threadIdx.z
+
+#define hipBlockIdx_x blockIdx.x
+#define hipBlockIdx_y blockIdx.y
+#define hipBlockIdx_z blockIdx.z
+
+#define hipBlockDim_x blockDim.x
+#define hipBlockDim_y blockDim.y
+#define hipBlockDim_z blockDim.z
+
+#define hipGridDim_x gridDim.x
+#define hipGridDim_y gridDim.y
+#define hipGridDim_z gridDim.z
+
+#define HIP_SYMBOL(X) &X
+
+/**
+ * Map HIP_DYNAMIC_SHARED to "extern __shared__" for compatibility with old HIP applications
+ * To be removed in a future release.
+ */
+#define HIP_DYNAMIC_SHARED(type, var) extern __shared__ type var[];
+#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+
+#ifdef __HIP_DEVICE_COMPILE__  // in this configuration assert() is replaced by a trap-based check
+#define abort_()                                                                                    \
+    { asm("trap;"); }
+#undef assert  // discard any earlier assert definition before installing the one below
+#define assert(COND)                                                                               \
+    {                                                                                              \
+        if (!(COND)) { /* parenthesized: assert(a == b) must negate the whole expression */        \
+            abort_();                                                                               \
+        }                                                                                          \
+    }
+#endif
+
+#define __clock() clock()
+#define __clock64() clock64()
+
+#endif
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
new file mode 100644
index 0000000000..66e4743abd
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -0,0 +1,2195 @@
+/*
+Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
+
+#include <cuda_runtime_api.h>
+#include <cuda.h>
+#include <cuda_profiler_api.h>
+#include <cuda_fp16.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef __cplusplus
+#define __dparm(x) = x
+#else
+#define __dparm(x)
+#endif
+
+// Add Deprecated Support for CUDA Mapped HIP APIs
+#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
+#define __HIP_DEPRECATED
+#elif defined(_MSC_VER)
+#define __HIP_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __HIP_DEPRECATED __attribute__((deprecated))
+#else
+#define __HIP_DEPRECATED
+#endif
+
+
+// TODO -move to include/hip_runtime_api.h as a common implementation.
+/**
+ * Memory copy types
+ *
+ */
+typedef enum hipMemcpyKind {
+    hipMemcpyHostToHost,
+    hipMemcpyHostToDevice,
+    hipMemcpyDeviceToHost,
+    hipMemcpyDeviceToDevice,
+    hipMemcpyDefault
+} hipMemcpyKind;
+
+typedef enum hipMemoryAdvise {
+    hipMemAdviseSetReadMostly,
+    hipMemAdviseUnsetReadMostly,
+    hipMemAdviseSetPreferredLocation,
+    hipMemAdviseUnsetPreferredLocation,
+    hipMemAdviseSetAccessedBy,
+    hipMemAdviseUnsetAccessedBy
+} hipMemoryAdvise;
+
+// hipDataType
+#define hipDataType cudaDataType
+#define HIP_R_16F CUDA_R_16F
+#define HIP_R_32F CUDA_R_32F
+#define HIP_R_64F CUDA_R_64F
+#define HIP_C_16F CUDA_C_16F
+#define HIP_C_32F CUDA_C_32F
+#define HIP_C_64F CUDA_C_64F
+
+// hipLibraryPropertyType
+#define hipLibraryPropertyType libraryPropertyType
+#define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
+#define HIP_LIBRARY_MINOR_VERSION MINOR_VERSION
+#define HIP_LIBRARY_PATCH_LEVEL PATCH_LEVEL
+
+#define HIP_ARRAY_DESCRIPTOR CUDA_ARRAY_DESCRIPTOR
+#define HIP_ARRAY3D_DESCRIPTOR CUDA_ARRAY3D_DESCRIPTOR
+
+// hipArray_Format values
+#define HIP_AD_FORMAT_UNSIGNED_INT8   CU_AD_FORMAT_UNSIGNED_INT8
+#define HIP_AD_FORMAT_UNSIGNED_INT16  CU_AD_FORMAT_UNSIGNED_INT16
+#define HIP_AD_FORMAT_UNSIGNED_INT32  CU_AD_FORMAT_UNSIGNED_INT32
+#define HIP_AD_FORMAT_SIGNED_INT8     CU_AD_FORMAT_SIGNED_INT8
+#define HIP_AD_FORMAT_SIGNED_INT16    CU_AD_FORMAT_SIGNED_INT16
+#define HIP_AD_FORMAT_SIGNED_INT32    CU_AD_FORMAT_SIGNED_INT32
+#define HIP_AD_FORMAT_HALF            CU_AD_FORMAT_HALF
+#define HIP_AD_FORMAT_FLOAT           CU_AD_FORMAT_FLOAT
+
+// hipArray_Format
+#define hipArray_Format CUarray_format
+
+/**
+ * Translate a hipArray_Format into the corresponding CUarray_format.
+ *
+ * Each of the eight HIP_AD_FORMAT_* values handled below maps to the
+ * CU_AD_FORMAT_* constant of the same name; any other value yields
+ * CU_AD_FORMAT_UNSIGNED_INT8.
+ */
+inline static CUarray_format hipArray_FormatToCUarray_format(
+    hipArray_Format format) {
+    CUarray_format cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;  // result for unlisted values
+    switch (format) {
+        case HIP_AD_FORMAT_UNSIGNED_INT8:  cuFormat = CU_AD_FORMAT_UNSIGNED_INT8;  break;
+        case HIP_AD_FORMAT_UNSIGNED_INT16: cuFormat = CU_AD_FORMAT_UNSIGNED_INT16; break;
+        case HIP_AD_FORMAT_UNSIGNED_INT32: cuFormat = CU_AD_FORMAT_UNSIGNED_INT32; break;
+        case HIP_AD_FORMAT_SIGNED_INT8:    cuFormat = CU_AD_FORMAT_SIGNED_INT8;    break;
+        case HIP_AD_FORMAT_SIGNED_INT16:   cuFormat = CU_AD_FORMAT_SIGNED_INT16;   break;
+        case HIP_AD_FORMAT_SIGNED_INT32:   cuFormat = CU_AD_FORMAT_SIGNED_INT32;   break;
+        case HIP_AD_FORMAT_HALF:           cuFormat = CU_AD_FORMAT_HALF;           break;
+        case HIP_AD_FORMAT_FLOAT:          cuFormat = CU_AD_FORMAT_FLOAT;          break;
+        default:                           break;
+    }
+    return cuFormat;
+}
+
+#define HIP_TR_ADDRESS_MODE_WRAP   CU_TR_ADDRESS_MODE_WRAP
+#define HIP_TR_ADDRESS_MODE_CLAMP  CU_TR_ADDRESS_MODE_CLAMP
+#define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR
+#define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER
+
+// hipAddress_mode
+#define hipAddress_mode CUaddress_mode
+
+/**
+ * Translate a hipAddress_mode into the corresponding CUaddress_mode.
+ * Values other than WRAP, CLAMP, MIRROR and BORDER yield CU_TR_ADDRESS_MODE_WRAP.
+ */
+inline static CUaddress_mode hipAddress_modeToCUaddress_mode(hipAddress_mode mode) {
+    CUaddress_mode cuMode = CU_TR_ADDRESS_MODE_WRAP;  // result for unlisted values
+    switch (mode) {
+        case HIP_TR_ADDRESS_MODE_WRAP:   cuMode = CU_TR_ADDRESS_MODE_WRAP;   break;
+        case HIP_TR_ADDRESS_MODE_CLAMP:  cuMode = CU_TR_ADDRESS_MODE_CLAMP;  break;
+        case HIP_TR_ADDRESS_MODE_MIRROR: cuMode = CU_TR_ADDRESS_MODE_MIRROR; break;
+        case HIP_TR_ADDRESS_MODE_BORDER: cuMode = CU_TR_ADDRESS_MODE_BORDER; break;
+        default:                         break;
+    }
+    return cuMode;
+}
+
+#define HIP_TR_FILTER_MODE_POINT   CU_TR_FILTER_MODE_POINT
+#define HIP_TR_FILTER_MODE_LINEAR  CU_TR_FILTER_MODE_LINEAR
+
+// hipFilter_mode
+#define hipFilter_mode CUfilter_mode
+
+/**
+ * Translate a hipFilter_mode into the corresponding CUfilter_mode.
+ * HIP_TR_FILTER_MODE_LINEAR maps to CU_TR_FILTER_MODE_LINEAR; every other
+ * value, HIP_TR_FILTER_MODE_POINT included, yields CU_TR_FILTER_MODE_POINT.
+ */
+inline static CUfilter_mode hipFilter_mode_enumToCUfilter_mode(hipFilter_mode mode) {
+    if (mode == HIP_TR_FILTER_MODE_LINEAR) {
+        return CU_TR_FILTER_MODE_LINEAR;
+    }
+    return CU_TR_FILTER_MODE_POINT;
+}
+
+// hipResourcetype values
+#define HIP_RESOURCE_TYPE_ARRAY            CU_RESOURCE_TYPE_ARRAY
+#define HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY  CU_RESOURCE_TYPE_MIPMAPPED_ARRAY
+#define HIP_RESOURCE_TYPE_LINEAR           CU_RESOURCE_TYPE_LINEAR
+#define HIP_RESOURCE_TYPE_PITCH2D          CU_RESOURCE_TYPE_PITCH2D
+
+// hipResourcetype
+#define hipResourcetype CUresourcetype
+
+/**
+ * Translate a hipResourcetype into the corresponding CUresourcetype.
+ * Values other than the four HIP_RESOURCE_TYPE_* constants yield CU_RESOURCE_TYPE_ARRAY.
+ */
+inline static CUresourcetype hipResourcetype_enumToCUresourcetype(hipResourcetype resType) {
+    CUresourcetype cuResType = CU_RESOURCE_TYPE_ARRAY;  // result for unlisted values
+    switch (resType) {
+        case HIP_RESOURCE_TYPE_ARRAY:           cuResType = CU_RESOURCE_TYPE_ARRAY;           break;
+        case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY: cuResType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; break;
+        case HIP_RESOURCE_TYPE_LINEAR:          cuResType = CU_RESOURCE_TYPE_LINEAR;          break;
+        case HIP_RESOURCE_TYPE_PITCH2D:         cuResType = CU_RESOURCE_TYPE_PITCH2D;         break;
+        default:                                break;
+    }
+    return cuResType;
+}
+
+#define hipTexRef CUtexref
+#define hiparray CUarray
+
+// hipTextureAddressMode
+typedef enum cudaTextureAddressMode hipTextureAddressMode;
+#define hipAddressModeWrap cudaAddressModeWrap
+#define hipAddressModeClamp cudaAddressModeClamp
+#define hipAddressModeMirror cudaAddressModeMirror
+#define hipAddressModeBorder cudaAddressModeBorder
+
+// hipTextureFilterMode
+typedef enum cudaTextureFilterMode hipTextureFilterMode;
+#define hipFilterModePoint cudaFilterModePoint
+#define hipFilterModeLinear cudaFilterModeLinear
+
+// hipTextureReadMode
+typedef enum cudaTextureReadMode hipTextureReadMode;
+#define hipReadModeElementType cudaReadModeElementType
+#define hipReadModeNormalizedFloat cudaReadModeNormalizedFloat
+
+// hipChannelFormatKind
+typedef enum cudaChannelFormatKind hipChannelFormatKind;
+#define hipChannelFormatKindSigned      cudaChannelFormatKindSigned
+#define hipChannelFormatKindUnsigned    cudaChannelFormatKindUnsigned
+#define hipChannelFormatKindFloat       cudaChannelFormatKindFloat
+#define hipChannelFormatKindNone        cudaChannelFormatKindNone
+
+// hipMemRangeAttribute
+typedef enum cudaMemRangeAttribute hipMemRangeAttribute;
+#define hipMemRangeAttributeReadMostly cudaMemRangeAttributeReadMostly
+#define hipMemRangeAttributePreferredLocation cudaMemRangeAttributePreferredLocation
+#define hipMemRangeAttributeAccessedBy cudaMemRangeAttributeAccessedBy
+#define hipMemRangeAttributeLastPrefetchLocation cudaMemRangeAttributeLastPrefetchLocation
+
+#define hipSurfaceBoundaryMode cudaSurfaceBoundaryMode
+#define hipBoundaryModeZero cudaBoundaryModeZero
+#define hipBoundaryModeTrap cudaBoundaryModeTrap
+#define hipBoundaryModeClamp cudaBoundaryModeClamp
+
+// hipFuncCache
+#define hipFuncCachePreferNone cudaFuncCachePreferNone
+#define hipFuncCachePreferShared cudaFuncCachePreferShared
+#define hipFuncCachePreferL1 cudaFuncCachePreferL1
+#define hipFuncCachePreferEqual cudaFuncCachePreferEqual
+
+// hipResourceType
+#define hipResourceType cudaResourceType
+#define hipResourceTypeArray cudaResourceTypeArray
+#define hipResourceTypeMipmappedArray cudaResourceTypeMipmappedArray
+#define hipResourceTypeLinear cudaResourceTypeLinear
+#define hipResourceTypePitch2D cudaResourceTypePitch2D
+//
+// hipErrorNoDevice.
+
+
+//! Flags that can be used with hipEventCreateWithFlags:
+#define hipEventDefault cudaEventDefault
+#define hipEventBlockingSync cudaEventBlockingSync
+#define hipEventDisableTiming cudaEventDisableTiming
+#define hipEventInterprocess cudaEventInterprocess
+#define hipEventReleaseToDevice 0 /* no-op on CUDA platform */
+#define hipEventReleaseToSystem 0 /* no-op on CUDA platform */
+
+
+#define hipHostMallocDefault cudaHostAllocDefault
+#define hipHostMallocPortable cudaHostAllocPortable
+#define hipHostMallocMapped cudaHostAllocMapped
+#define hipHostMallocWriteCombined cudaHostAllocWriteCombined
+#define hipHostMallocCoherent 0x0
+#define hipHostMallocNonCoherent 0x0
+
+#define hipMemAttachGlobal cudaMemAttachGlobal
+#define hipMemAttachHost cudaMemAttachHost
+#define hipMemAttachSingle cudaMemAttachSingle
+
+#define hipHostRegisterDefault cudaHostRegisterDefault
+#define hipHostRegisterPortable cudaHostRegisterPortable
+#define hipHostRegisterMapped cudaHostRegisterMapped
+#define hipHostRegisterIoMemory cudaHostRegisterIoMemory
+
+#define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
+#define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
+#define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
+#define hipLimitMallocHeapSize cudaLimitMallocHeapSize
+#define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
+
+#define hipOccupancyDefault cudaOccupancyDefault
+
+#define hipCooperativeLaunchMultiDeviceNoPreSync    \
+        cudaCooperativeLaunchMultiDeviceNoPreSync
+#define hipCooperativeLaunchMultiDeviceNoPostSync   \
+        cudaCooperativeLaunchMultiDeviceNoPostSync
+
+
+// enum CUjit_option redefines
+#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
+#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
+
+typedef cudaEvent_t hipEvent_t;
+typedef cudaStream_t hipStream_t;
+typedef cudaIpcEventHandle_t hipIpcEventHandle_t;
+typedef cudaIpcMemHandle_t hipIpcMemHandle_t;
+typedef enum cudaLimit hipLimit_t;
+typedef enum cudaFuncAttribute hipFuncAttribute;
+typedef enum cudaFuncCache hipFuncCache_t;
+typedef CUcontext hipCtx_t;
+typedef enum cudaSharedMemConfig hipSharedMemConfig;
+typedef CUfunc_cache hipFuncCache;
+typedef CUjit_option hipJitOption;
+typedef CUdevice hipDevice_t;
+typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
+#define hipDevP2PAttrPerformanceRank cudaDevP2PAttrPerformanceRank
+#define hipDevP2PAttrAccessSupported cudaDevP2PAttrAccessSupported
+#define hipDevP2PAttrNativeAtomicSupported cudaDevP2PAttrNativeAtomicSupported
+#define hipDevP2PAttrHipArrayAccessSupported cudaDevP2PAttrCudaArrayAccessSupported
+#define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize
+#define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout
+
+typedef CUmodule hipModule_t;
+typedef CUfunction hipFunction_t;
+typedef CUdeviceptr hipDeviceptr_t;
+typedef struct cudaArray hipArray;
+typedef struct cudaArray* hipArray_t;
+typedef struct cudaArray* hipArray_const_t;
+typedef struct cudaFuncAttributes hipFuncAttributes;
+typedef struct cudaLaunchParams hipLaunchParams;
+#define hipFunction_attribute CUfunction_attribute
+#define hip_Memcpy2D CUDA_MEMCPY2D
+#define HIP_MEMCPY3D CUDA_MEMCPY3D
+#define hipMemcpy3DParms cudaMemcpy3DParms
+#define hipArrayDefault cudaArrayDefault
+#define hipArrayLayered cudaArrayLayered
+#define hipArraySurfaceLoadStore cudaArraySurfaceLoadStore
+#define hipArrayCubemap cudaArrayCubemap
+#define hipArrayTextureGather cudaArrayTextureGather
+
+typedef cudaTextureObject_t hipTextureObject_t;
+typedef cudaSurfaceObject_t hipSurfaceObject_t;
+#define hipTextureType1D cudaTextureType1D
+#define hipTextureType1DLayered cudaTextureType1DLayered
+#define hipTextureType2D cudaTextureType2D
+#define hipTextureType2DLayered cudaTextureType2DLayered
+#define hipTextureType3D cudaTextureType3D
+
+#define hipDeviceScheduleAuto cudaDeviceScheduleAuto
+#define hipDeviceScheduleSpin cudaDeviceScheduleSpin
+#define hipDeviceScheduleYield cudaDeviceScheduleYield
+#define hipDeviceScheduleBlockingSync cudaDeviceScheduleBlockingSync
+#define hipDeviceScheduleMask cudaDeviceScheduleMask
+#define hipDeviceMapHost cudaDeviceMapHost
+#define hipDeviceLmemResizeToMax cudaDeviceLmemResizeToMax
+
+#define hipCpuDeviceId cudaCpuDeviceId
+#define hipInvalidDeviceId cudaInvalidDeviceId
+typedef struct cudaExtent hipExtent;
+typedef struct cudaPitchedPtr hipPitchedPtr;
+#define make_hipExtent make_cudaExtent
+#define make_hipPos make_cudaPos
+#define make_hipPitchedPtr make_cudaPitchedPtr
+// Flags that can be used with hipStreamCreateWithFlags
+#define hipStreamDefault cudaStreamDefault
+#define hipStreamNonBlocking cudaStreamNonBlocking
+
+typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
+typedef struct cudaResourceDesc hipResourceDesc;
+typedef struct cudaTextureDesc hipTextureDesc;
+typedef struct cudaResourceViewDesc hipResourceViewDesc;
+// hipSharedMemConfig bank-size values
+#define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
+#define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
+#define hipSharedMemBankSizeEightByte cudaSharedMemBankSizeEightByte
+
+//Function Attributes
+#define HIP_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK
+#define HIP_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_CONST_SIZE_BYTES CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_NUM_REGS CU_FUNC_ATTRIBUTE_NUM_REGS
+#define HIP_FUNC_ATTRIBUTE_PTX_VERSION CU_FUNC_ATTRIBUTE_PTX_VERSION
+#define HIP_FUNC_ATTRIBUTE_BINARY_VERSION CU_FUNC_ATTRIBUTE_BINARY_VERSION
+#define HIP_FUNC_ATTRIBUTE_CACHE_MODE_CA CU_FUNC_ATTRIBUTE_CACHE_MODE_CA
+#define HIP_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES
+#define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
+#define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
+
+#if CUDA_VERSION >= 9000
+#define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
+#define __shfl_xor(...)  __shfl_xor_sync(0xffffffff, __VA_ARGS__)
+#endif // CUDA_VERSION >= 9000
+
+inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {  // cudaError_t -> hipError_t; unlisted codes become hipErrorUnknown
+    switch (cuError) {
+        case cudaSuccess:
+            return hipSuccess;
+        case cudaErrorProfilerDisabled:
+            return hipErrorProfilerDisabled;
+        case cudaErrorProfilerNotInitialized:
+            return hipErrorProfilerNotInitialized;
+        case cudaErrorProfilerAlreadyStarted:
+            return hipErrorProfilerAlreadyStarted;
+        case cudaErrorProfilerAlreadyStopped:
+            return hipErrorProfilerAlreadyStopped;
+        case cudaErrorInsufficientDriver:
+            return hipErrorInsufficientDriver;
+        case cudaErrorUnsupportedLimit:
+            return hipErrorUnsupportedLimit;
+        case cudaErrorPeerAccessUnsupported:
+            return hipErrorPeerAccessUnsupported;
+        case cudaErrorInvalidGraphicsContext:
+            return hipErrorInvalidGraphicsContext;
+        case cudaErrorSharedObjectSymbolNotFound:
+            return hipErrorSharedObjectSymbolNotFound;
+        case cudaErrorSharedObjectInitFailed:
+            return hipErrorSharedObjectInitFailed;
+        case cudaErrorOperatingSystem:
+            return hipErrorOperatingSystem;
+        case cudaErrorSetOnActiveProcess:
+            return hipErrorSetOnActiveProcess;
+        case cudaErrorIllegalAddress:
+            return hipErrorIllegalAddress;
+        case cudaErrorInvalidSymbol:
+            return hipErrorInvalidSymbol;
+        case cudaErrorMissingConfiguration:
+            return hipErrorMissingConfiguration;
+        case cudaErrorMemoryAllocation:
+            return hipErrorOutOfMemory;  // HIP uses a different name for this code
+        case cudaErrorInitializationError:
+            return hipErrorNotInitialized;  // HIP uses a different name for this code
+        case cudaErrorLaunchFailure:
+            return hipErrorLaunchFailure;
+        case cudaErrorCooperativeLaunchTooLarge:
+            return hipErrorCooperativeLaunchTooLarge;
+        case cudaErrorPriorLaunchFailure:
+            return hipErrorPriorLaunchFailure;
+        case cudaErrorLaunchOutOfResources:
+            return hipErrorLaunchOutOfResources;
+        case cudaErrorInvalidDeviceFunction:
+            return hipErrorInvalidDeviceFunction;
+        case cudaErrorInvalidConfiguration:
+            return hipErrorInvalidConfiguration;
+        case cudaErrorInvalidDevice:
+            return hipErrorInvalidDevice;
+        case cudaErrorInvalidValue:
+            return hipErrorInvalidValue;
+        case cudaErrorInvalidDevicePointer:
+            return hipErrorInvalidDevicePointer;
+        case cudaErrorInvalidMemcpyDirection:
+            return hipErrorInvalidMemcpyDirection;
+        case cudaErrorInvalidResourceHandle:
+            return hipErrorInvalidHandle;
+        case cudaErrorNotReady:
+            return hipErrorNotReady;
+        case cudaErrorNoDevice:
+            return hipErrorNoDevice;
+        case cudaErrorPeerAccessAlreadyEnabled:
+            return hipErrorPeerAccessAlreadyEnabled;
+        case cudaErrorPeerAccessNotEnabled:
+            return hipErrorPeerAccessNotEnabled;
+        case cudaErrorHostMemoryAlreadyRegistered:
+            return hipErrorHostMemoryAlreadyRegistered;
+        case cudaErrorHostMemoryNotRegistered:
+            return hipErrorHostMemoryNotRegistered;
+        case cudaErrorMapBufferObjectFailed:
+            return hipErrorMapFailed;
+        case cudaErrorAssert:
+            return hipErrorAssert;
+        case cudaErrorNotSupported:
+            return hipErrorNotSupported;
+        case cudaErrorCudartUnloading:
+            return hipErrorDeinitialized;  // HIP uses a different name for this code
+        case cudaErrorInvalidKernelImage:
+            return hipErrorInvalidImage;
+        case cudaErrorUnmapBufferObjectFailed:
+            return hipErrorUnmapFailed;
+        case cudaErrorNoKernelImageForDevice:
+            return hipErrorNoBinaryForGpu;
+        case cudaErrorECCUncorrectable:
+            return hipErrorECCNotCorrectable;
+        case cudaErrorDeviceAlreadyInUse:
+            return hipErrorContextAlreadyInUse;  // device-level CUDA code reported as a context-level HIP code
+        case cudaErrorInvalidPtx:
+            return hipErrorInvalidKernelFile;
+        case cudaErrorLaunchTimeout:
+            return hipErrorLaunchTimeOut;
+#if CUDA_VERSION >= 10010  // the cases in this group are compiled only at or above this CUDA_VERSION
+        case cudaErrorInvalidSource:
+            return hipErrorInvalidSource;
+        case cudaErrorFileNotFound:
+            return hipErrorFileNotFound;
+        case cudaErrorSymbolNotFound:
+            return hipErrorNotFound;
+        case cudaErrorArrayIsMapped:
+            return hipErrorArrayIsMapped;
+        case cudaErrorNotMappedAsPointer:
+            return hipErrorNotMappedAsPointer;
+        case cudaErrorNotMappedAsArray:
+            return hipErrorNotMappedAsArray;
+        case cudaErrorNotMapped:
+            return hipErrorNotMapped;
+        case cudaErrorAlreadyAcquired:
+            return hipErrorAlreadyAcquired;
+        case cudaErrorAlreadyMapped:
+            return hipErrorAlreadyMapped;
+#endif
+#if CUDA_VERSION >= 10020  // this case is compiled only at or above this CUDA_VERSION
+        case cudaErrorDeviceUninitialized:
+            return hipErrorInvalidContext;
+#endif
+        case cudaErrorUnknown:  // shares the default result below
+        default:
+            return hipErrorUnknown;  // Note - translated error.
+    }
+}
+
+inline static hipError_t hipCUResultTohipError(CUresult cuError) {  // CUresult -> hipError_t; unlisted codes become hipErrorUnknown
+    switch (cuError) {
+        case CUDA_SUCCESS:
+            return hipSuccess;
+        case CUDA_ERROR_OUT_OF_MEMORY:
+            return hipErrorOutOfMemory;
+        case CUDA_ERROR_INVALID_VALUE:
+            return hipErrorInvalidValue;
+        case CUDA_ERROR_INVALID_DEVICE:
+            return hipErrorInvalidDevice;
+        case CUDA_ERROR_DEINITIALIZED:
+            return hipErrorDeinitialized;
+        case CUDA_ERROR_NO_DEVICE:
+            return hipErrorNoDevice;
+        case CUDA_ERROR_INVALID_CONTEXT:
+            return hipErrorInvalidContext;
+        case CUDA_ERROR_NOT_INITIALIZED:
+            return hipErrorNotInitialized;
+        case CUDA_ERROR_INVALID_HANDLE:
+            return hipErrorInvalidHandle;
+        case CUDA_ERROR_MAP_FAILED:
+            return hipErrorMapFailed;
+        case CUDA_ERROR_PROFILER_DISABLED:
+            return hipErrorProfilerDisabled;
+        case CUDA_ERROR_PROFILER_NOT_INITIALIZED:
+            return hipErrorProfilerNotInitialized;
+        case CUDA_ERROR_PROFILER_ALREADY_STARTED:
+            return hipErrorProfilerAlreadyStarted;
+        case CUDA_ERROR_PROFILER_ALREADY_STOPPED:
+            return hipErrorProfilerAlreadyStopped;
+        case CUDA_ERROR_INVALID_IMAGE:
+            return hipErrorInvalidImage;
+        case CUDA_ERROR_CONTEXT_ALREADY_CURRENT:
+            return hipErrorContextAlreadyCurrent;
+        case CUDA_ERROR_UNMAP_FAILED:
+            return hipErrorUnmapFailed;
+        case CUDA_ERROR_ARRAY_IS_MAPPED:
+            return hipErrorArrayIsMapped;
+        case CUDA_ERROR_ALREADY_MAPPED:
+            return hipErrorAlreadyMapped;
+        case CUDA_ERROR_NO_BINARY_FOR_GPU:
+            return hipErrorNoBinaryForGpu;
+        case CUDA_ERROR_ALREADY_ACQUIRED:
+            return hipErrorAlreadyAcquired;
+        case CUDA_ERROR_NOT_MAPPED:
+            return hipErrorNotMapped;
+        case CUDA_ERROR_NOT_MAPPED_AS_ARRAY:
+            return hipErrorNotMappedAsArray;
+        case CUDA_ERROR_NOT_MAPPED_AS_POINTER:
+            return hipErrorNotMappedAsPointer;
+        case CUDA_ERROR_ECC_UNCORRECTABLE:
+            return hipErrorECCNotCorrectable;
+        case CUDA_ERROR_UNSUPPORTED_LIMIT:
+            return hipErrorUnsupportedLimit;
+        case CUDA_ERROR_CONTEXT_ALREADY_IN_USE:
+            return hipErrorContextAlreadyInUse;
+        case CUDA_ERROR_PEER_ACCESS_UNSUPPORTED:
+            return hipErrorPeerAccessUnsupported;
+        case CUDA_ERROR_INVALID_PTX:
+            return hipErrorInvalidKernelFile;  // HIP uses a different name for this code
+        case CUDA_ERROR_INVALID_GRAPHICS_CONTEXT:
+            return hipErrorInvalidGraphicsContext;
+        case CUDA_ERROR_INVALID_SOURCE:
+            return hipErrorInvalidSource;
+        case CUDA_ERROR_FILE_NOT_FOUND:
+            return hipErrorFileNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND:
+            return hipErrorSharedObjectSymbolNotFound;
+        case CUDA_ERROR_SHARED_OBJECT_INIT_FAILED:
+            return hipErrorSharedObjectInitFailed;
+        case CUDA_ERROR_OPERATING_SYSTEM:
+            return hipErrorOperatingSystem;
+        case CUDA_ERROR_NOT_FOUND:
+            return hipErrorNotFound;
+        case CUDA_ERROR_NOT_READY:
+            return hipErrorNotReady;
+        case CUDA_ERROR_ILLEGAL_ADDRESS:
+            return hipErrorIllegalAddress;
+        case CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES:
+            return hipErrorLaunchOutOfResources;
+        case CUDA_ERROR_LAUNCH_TIMEOUT:
+            return hipErrorLaunchTimeOut;
+        case CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED:
+            return hipErrorPeerAccessAlreadyEnabled;
+        case CUDA_ERROR_PEER_ACCESS_NOT_ENABLED:
+            return hipErrorPeerAccessNotEnabled;
+        case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
+            return hipErrorSetOnActiveProcess;  // HIP uses a different name for this code
+        case CUDA_ERROR_ASSERT:
+            return hipErrorAssert;
+        case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
+            return hipErrorHostMemoryAlreadyRegistered;
+        case CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED:
+            return hipErrorHostMemoryNotRegistered;
+        case CUDA_ERROR_LAUNCH_FAILED:
+            return hipErrorLaunchFailure;
+        case CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE:
+            return hipErrorCooperativeLaunchTooLarge;
+        case CUDA_ERROR_NOT_SUPPORTED:
+            return hipErrorNotSupported;
+        case CUDA_ERROR_UNKNOWN:  // shares the default result below
+        default:
+            return hipErrorUnknown;  // Note - translated error.
+    }
+}
+
+inline static cudaError_t hipErrorToCudaError(hipError_t hError) {  // HIP error code -> CUDA runtime error code
+    switch (hError) {
+        case hipSuccess:
+            return cudaSuccess;
+        case hipErrorOutOfMemory:
+            return cudaErrorMemoryAllocation;
+        case hipErrorProfilerDisabled:
+          return cudaErrorProfilerDisabled;
+        case hipErrorProfilerNotInitialized:
+            return cudaErrorProfilerNotInitialized;
+        case hipErrorProfilerAlreadyStarted:
+            return cudaErrorProfilerAlreadyStarted;
+        case hipErrorProfilerAlreadyStopped:
+            return cudaErrorProfilerAlreadyStopped;
+        case hipErrorInvalidConfiguration:
+            return cudaErrorInvalidConfiguration;
+        case hipErrorLaunchOutOfResources:
+            return cudaErrorLaunchOutOfResources;
+        case hipErrorInvalidValue:
+            return cudaErrorInvalidValue;
+        case hipErrorInvalidHandle:
+            return cudaErrorInvalidResourceHandle;
+        case hipErrorInvalidDevice:
+            return cudaErrorInvalidDevice;
+        case hipErrorInvalidMemcpyDirection:
+            return cudaErrorInvalidMemcpyDirection;
+        case hipErrorInvalidDevicePointer:
+            return cudaErrorInvalidDevicePointer;
+        case hipErrorNotInitialized:
+            return cudaErrorInitializationError;
+        case hipErrorNoDevice:
+            return cudaErrorNoDevice;
+        case hipErrorNotReady:
+            return cudaErrorNotReady;
+        case hipErrorPeerAccessNotEnabled:
+            return cudaErrorPeerAccessNotEnabled;
+        case hipErrorPeerAccessAlreadyEnabled:
+            return cudaErrorPeerAccessAlreadyEnabled;
+        case hipErrorHostMemoryAlreadyRegistered:
+            return cudaErrorHostMemoryAlreadyRegistered;
+        case hipErrorHostMemoryNotRegistered:
+            return cudaErrorHostMemoryNotRegistered;
+        case hipErrorDeinitialized:
+            return cudaErrorCudartUnloading;
+        case hipErrorInvalidSymbol:
+            return cudaErrorInvalidSymbol;
+        case hipErrorInsufficientDriver:
+            return cudaErrorInsufficientDriver;
+        case hipErrorMissingConfiguration:
+            return cudaErrorMissingConfiguration;
+        case hipErrorPriorLaunchFailure:
+            return cudaErrorPriorLaunchFailure;
+        case hipErrorInvalidDeviceFunction:
+            return cudaErrorInvalidDeviceFunction;
+        case hipErrorInvalidImage:
+            return cudaErrorInvalidKernelImage;
+        case hipErrorInvalidContext:  // dedicated CUDA code is used only when CUDA_VERSION >= 10020
+#if CUDA_VERSION >= 10020
+            return cudaErrorDeviceUninitialized;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorMapFailed:
+            return cudaErrorMapBufferObjectFailed;
+        case hipErrorUnmapFailed:
+            return cudaErrorUnmapBufferObjectFailed;
+        case hipErrorArrayIsMapped:  // this and the other 10010-gated cases yield cudaErrorUnknown on older toolkits
+#if CUDA_VERSION >= 10010
+            return cudaErrorArrayIsMapped;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorAlreadyMapped:
+#if CUDA_VERSION >= 10010
+            return cudaErrorAlreadyMapped;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNoBinaryForGpu:
+            return cudaErrorNoKernelImageForDevice;
+        case hipErrorAlreadyAcquired:
+#if CUDA_VERSION >= 10010
+            return cudaErrorAlreadyAcquired;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMapped:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMapped;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMappedAsArray:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMappedAsArray;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorNotMappedAsPointer:
+#if CUDA_VERSION >= 10010
+            return cudaErrorNotMappedAsPointer;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorECCNotCorrectable:
+            return cudaErrorECCUncorrectable;
+        case hipErrorUnsupportedLimit:
+            return cudaErrorUnsupportedLimit;
+        case hipErrorContextAlreadyInUse:
+            return cudaErrorDeviceAlreadyInUse;
+        case hipErrorPeerAccessUnsupported:
+            return cudaErrorPeerAccessUnsupported;
+        case hipErrorInvalidKernelFile:
+            return cudaErrorInvalidPtx;
+        case hipErrorInvalidGraphicsContext:
+            return cudaErrorInvalidGraphicsContext;
+        case hipErrorInvalidSource:
+#if CUDA_VERSION >= 10010
+            return cudaErrorInvalidSource;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorFileNotFound:
+#if CUDA_VERSION >= 10010
+            return cudaErrorFileNotFound;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorSharedObjectSymbolNotFound:
+            return cudaErrorSharedObjectSymbolNotFound;
+        case hipErrorSharedObjectInitFailed:
+            return cudaErrorSharedObjectInitFailed;
+        case hipErrorOperatingSystem:
+            return cudaErrorOperatingSystem;
+        case hipErrorNotFound:
+#if CUDA_VERSION >= 10010
+            return cudaErrorSymbolNotFound;
+#else
+            return cudaErrorUnknown;
+#endif
+        case hipErrorIllegalAddress:
+            return cudaErrorIllegalAddress;
+        case hipErrorLaunchTimeOut:
+            return cudaErrorLaunchTimeout;
+        case hipErrorSetOnActiveProcess:
+            return cudaErrorSetOnActiveProcess;
+        case hipErrorLaunchFailure:
+            return cudaErrorLaunchFailure;
+        case hipErrorCooperativeLaunchTooLarge:
+            return cudaErrorCooperativeLaunchTooLarge;
+        case hipErrorNotSupported:
+            return cudaErrorNotSupported;
+        // HSA-specific code with no CUDA counterpart; falls through to cudaErrorUnknown.
+        case hipErrorRuntimeMemory:
+        // HSA-specific code with no CUDA counterpart; falls through to cudaErrorUnknown.
+        case hipErrorRuntimeOther:
+        case hipErrorUnknown:
+        case hipErrorTbd:
+        default:
+            return cudaErrorUnknown;  // Note - translated error; also reached by any unlisted value.
+    }
+}
+
+/**
+ * Translates a HIP copy direction to the CUDA runtime enum; unlisted values give cudaMemcpyDefault.
+ */
+inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind kind) {
+    enum cudaMemcpyKind cudaKind = cudaMemcpyDefault;
+    switch (kind) {
+        case hipMemcpyHostToHost:     cudaKind = cudaMemcpyHostToHost;     break;
+        case hipMemcpyHostToDevice:   cudaKind = cudaMemcpyHostToDevice;   break;
+        case hipMemcpyDeviceToHost:   cudaKind = cudaMemcpyDeviceToHost;   break;
+        case hipMemcpyDeviceToDevice: cudaKind = cudaMemcpyDeviceToDevice; break;
+        default:                      break;
+    }
+    return cudaKind;
+}
+
+/**
+ * Translates a HIP texture address mode to the CUDA enum; unlisted values give cudaAddressModeWrap.
+ */
+inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddressMode(
+    hipTextureAddressMode kind) {
+    enum cudaTextureAddressMode cudaMode = cudaAddressModeWrap;
+    switch (kind) {
+        case hipAddressModeWrap:   cudaMode = cudaAddressModeWrap;   break;
+        case hipAddressModeClamp:  cudaMode = cudaAddressModeClamp;  break;
+        case hipAddressModeMirror: cudaMode = cudaAddressModeMirror; break;
+        case hipAddressModeBorder: cudaMode = cudaAddressModeBorder; break;
+        default:                   break;
+    }
+    return cudaMode;
+}
+
+/**
+ * Translates a HIP memory-range attribute to the CUDA enum; unlisted values give ...ReadMostly.
+ */
+inline static enum cudaMemRangeAttribute hipMemRangeAttributeToCudaMemRangeAttribute(
+    hipMemRangeAttribute kind) {
+    enum cudaMemRangeAttribute cudaAttr = cudaMemRangeAttributeReadMostly;
+    switch (kind) {
+        case hipMemRangeAttributeReadMostly:           cudaAttr = cudaMemRangeAttributeReadMostly;           break;
+        case hipMemRangeAttributePreferredLocation:    cudaAttr = cudaMemRangeAttributePreferredLocation;    break;
+        case hipMemRangeAttributeAccessedBy:           cudaAttr = cudaMemRangeAttributeAccessedBy;           break;
+        case hipMemRangeAttributeLastPrefetchLocation: cudaAttr = cudaMemRangeAttributeLastPrefetchLocation; break;
+        default:                                       break;
+    }
+    return cudaAttr;
+}
+
+/**
+ * Translates a HIP memory-advise value to the CUDA runtime enum.
+ *
+ * Values without an explicit case fall back to cudaMemAdviseSetReadMostly.
+ */
+inline static enum cudaMemoryAdvise hipMemoryAdviseTocudaMemoryAdvise(
+    hipMemoryAdvise kind) {
+    enum cudaMemoryAdvise cudaAdvice = cudaMemAdviseSetReadMostly;
+    switch (kind) {
+        case hipMemAdviseSetReadMostly:          cudaAdvice = cudaMemAdviseSetReadMostly;          break;
+        case hipMemAdviseUnsetReadMostly:        cudaAdvice = cudaMemAdviseUnsetReadMostly;        break;
+        case hipMemAdviseSetPreferredLocation:   cudaAdvice = cudaMemAdviseSetPreferredLocation;   break;
+        case hipMemAdviseUnsetPreferredLocation: cudaAdvice = cudaMemAdviseUnsetPreferredLocation; break;
+        case hipMemAdviseSetAccessedBy:          cudaAdvice = cudaMemAdviseSetAccessedBy;          break;
+        case hipMemAdviseUnsetAccessedBy:        cudaAdvice = cudaMemAdviseUnsetAccessedBy;        break;
+        default:                                 break;
+    }
+    return cudaAdvice;
+}
+
+/** Translates a HIP texture filter mode to the CUDA enum; unlisted values give cudaFilterModePoint. */
+inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilterMode(
+    hipTextureFilterMode kind) {
+    enum cudaTextureFilterMode cudaMode = cudaFilterModePoint;
+    switch (kind) {
+        case hipFilterModePoint:  cudaMode = cudaFilterModePoint;  break;
+        case hipFilterModeLinear: cudaMode = cudaFilterModeLinear; break;
+        default:                  break;
+    }
+    return cudaMode;
+}
+
+/** Translates a HIP texture read mode to the CUDA enum; unlisted values give cudaReadModeElementType. */
+inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(hipTextureReadMode kind) {
+    enum cudaTextureReadMode cudaMode = cudaReadModeElementType;
+    switch (kind) {
+        case hipReadModeElementType:     cudaMode = cudaReadModeElementType;     break;
+        case hipReadModeNormalizedFloat: cudaMode = cudaReadModeNormalizedFloat; break;
+        default:                         break;
+    }
+    return cudaMode;
+}
+
+/**
+ * Translates a HIP channel format kind to the CUDA enum; unlisted values give ...KindNone.
+ */
+inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormatKind(
+    hipChannelFormatKind kind) {
+    enum cudaChannelFormatKind cudaFormat = cudaChannelFormatKindNone;
+    switch (kind) {
+        case hipChannelFormatKindSigned:   cudaFormat = cudaChannelFormatKindSigned;   break;
+        case hipChannelFormatKindUnsigned: cudaFormat = cudaChannelFormatKindUnsigned; break;
+        case hipChannelFormatKindFloat:    cudaFormat = cudaChannelFormatKindFloat;    break;
+        case hipChannelFormatKindNone:     cudaFormat = cudaChannelFormatKindNone;     break;
+        default:                           break;
+    }
+    return cudaFormat;
+}
+
+/**
+ * Stream callback function-pointer type; HIPRT_CB expands to CUDA's CUDART_CB.
+ */
+#define HIPRT_CB CUDART_CB
+typedef void(HIPRT_CB* hipStreamCallback_t)(hipStream_t stream, hipError_t status, void* userData);
+inline static hipError_t hipInit(unsigned int flags) {  // calls the driver-API cuInit(flags)
+    return hipCUResultTohipError(cuInit(flags));
+}
+
+inline static hipError_t hipDeviceReset() { return hipCUDAErrorTohipError(cudaDeviceReset()); }  // forwards to cudaDeviceReset
+
+inline static hipError_t hipGetLastError() { return hipCUDAErrorTohipError(cudaGetLastError()); }  // forwards to cudaGetLastError
+
+inline static hipError_t hipPeekAtLastError() {  // forwards to cudaPeekAtLastError
+    return hipCUDAErrorTohipError(cudaPeekAtLastError());
+}
+
+inline static hipError_t hipMalloc(void** ptr, size_t size) {  // forwards to cudaMalloc
+    return hipCUDAErrorTohipError(cudaMalloc(ptr, size));
+}
+
+inline static hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height) {
+    return hipCUDAErrorTohipError(cudaMallocPitch(ptr, pitch, width, height));  // runtime-API pitched allocation
+}
+
+inline static hipError_t hipMemAllocPitch(hipDeviceptr_t* dptr,size_t* pitch,size_t widthInBytes,size_t height,unsigned int elementSizeBytes){
+    return hipCUResultTohipError(cuMemAllocPitch(dptr,pitch,widthInBytes,height,elementSizeBytes));  // driver-API variant
+}
+
+inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent extent) {  // forwards to cudaMalloc3D
+    return hipCUDAErrorTohipError(cudaMalloc3D(pitchedDevPtr, extent));
+}
+
+inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); }  // forwards to cudaFree
+
+inline static hipError_t hipMallocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // prototype exists only to carry the attribute
+inline static hipError_t hipMallocHost(void** ptr, size_t size) {  // forwards to cudaMallocHost
+    return hipCUDAErrorTohipError(cudaMallocHost(ptr, size));
+}
+
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // prototype exists only to carry the attribute
+inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {  // driver-API path: cuMemAllocHost
+    return hipCUResultTohipError(cuMemAllocHost(ptr, size));
+}
+
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags)
+    __attribute__((deprecated("use hipHostMalloc instead")));  // prototype exists only to carry the attribute
+inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {  // forwards to cudaHostAlloc
+    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
+}
+
+inline static hipError_t hipHostMalloc(void** ptr, size_t size, unsigned int flags) {  // flags are passed to cudaHostAlloc unchanged
+    return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
+}
+
+inline static hipError_t hipMemAdvise(const void* dev_ptr, size_t count, hipMemoryAdvise advice,
+                                      int device) {  // forwards to cudaMemAdvise
+    const enum cudaMemoryAdvise cudaAdvice = hipMemoryAdviseTocudaMemoryAdvise(advice);  // HIP -> CUDA enum
+    return hipCUDAErrorTohipError(cudaMemAdvise(dev_ptr, count, cudaAdvice, device));
+}
+
+inline static hipError_t hipMemPrefetchAsync(const void* dev_ptr, size_t count, int device,
+                                             hipStream_t stream __dparm(0)) {  // forwards to cudaMemPrefetchAsync
+    return hipCUDAErrorTohipError(cudaMemPrefetchAsync(dev_ptr, count, device, stream));
+}
+
+inline static hipError_t hipMemRangeGetAttribute(void* data, size_t data_size,
+                                                 hipMemRangeAttribute attribute,  // translated to the CUDA enum below
+                                                 const void* dev_ptr, size_t count) {
+    const enum cudaMemRangeAttribute cudaAttr = hipMemRangeAttributeToCudaMemRangeAttribute(attribute);
+    return hipCUDAErrorTohipError(cudaMemRangeGetAttribute(data, data_size, cudaAttr, dev_ptr, count));
+}
+
+inline static hipError_t hipMemRangeGetAttributes(void** data, size_t* data_sizes,
+                                                  hipMemRangeAttribute* attributes,
+                                                  size_t num_attributes, const void* dev_ptr,
+                                                  size_t count) {  // forwards to cudaMemRangeGetAttributes
+    // NOTE(review): attributes is passed through without per-element translation, unlike hipMemRangeGetAttribute - confirm the enum types match.
+    return hipCUDAErrorTohipError(cudaMemRangeGetAttributes(data, data_sizes, attributes, num_attributes, dev_ptr, count));
+}
+
+inline static hipError_t hipStreamAttachMemAsync(hipStream_t stream, hipDeviceptr_t* dev_ptr,
+                                                 size_t length __dparm(0),
+                                                 unsigned int flags __dparm(hipMemAttachSingle)) {
+    return hipCUDAErrorTohipError(cudaStreamAttachMemAsync(stream, dev_ptr, length, flags));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int flags) {  // forwards to cudaMallocManaged
+    return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags));
+}
+
+inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
+                                        size_t width, size_t height,
+                                        unsigned int flags __dparm(hipArrayDefault)) {
+    return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc,
+                             hipExtent extent, unsigned int flags) {  // forwards to cudaMalloc3DArray
+    return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags));
+}
+
+inline static hipError_t hipFreeArray(hipArray* array) {  // forwards to cudaFreeArray
+    return hipCUDAErrorTohipError(cudaFreeArray(array));
+}
+
+inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipHostGetFlags(unsigned int* flagsPtr, void* hostPtr) {  // forwards to cudaHostGetFlags
+    return hipCUDAErrorTohipError(cudaHostGetFlags(flagsPtr, hostPtr));
+}
+
+inline static hipError_t hipHostRegister(void* ptr, size_t size, unsigned int flags) {  // forwards to cudaHostRegister
+    return hipCUDAErrorTohipError(cudaHostRegister(ptr, size, flags));
+}
+
+inline static hipError_t hipHostUnregister(void* ptr) {  // forwards to cudaHostUnregister
+    return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
+}
+
+inline static hipError_t hipFreeHost(void* ptr)
+    __attribute__((deprecated("use hipHostFree instead")));  // prototype exists only to carry the attribute
+inline static hipError_t hipFreeHost(void* ptr) {  // forwards to cudaFreeHost
+    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipHostFree(void* ptr) {  // forwards to cudaFreeHost
+    return hipCUDAErrorTohipError(cudaFreeHost(ptr));
+}
+
+inline static hipError_t hipSetDevice(int device) {  // forwards to cudaSetDevice
+    return hipCUDAErrorTohipError(cudaSetDevice(device));
+}
+
+inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) {  // forwards to cudaChooseDevice
+    if (!prop) return hipErrorInvalidValue;  // a NULL criteria pointer is rejected instead of being dereferenced
+    struct cudaDeviceProp cdprop;
+    memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp));  // fields not copied below stay zero
+    cdprop.major = prop->major; cdprop.minor = prop->minor;  // paired on one line to keep the patch hunk length
+    cdprop.totalGlobalMem = prop->totalGlobalMem;
+    cdprop.sharedMemPerBlock = prop->sharedMemPerBlock;
+    cdprop.regsPerBlock = prop->regsPerBlock;
+    cdprop.warpSize = prop->warpSize;
+    cdprop.maxThreadsPerBlock = prop->maxThreadsPerBlock;
+    cdprop.clockRate = prop->clockRate;
+    cdprop.totalConstMem = prop->totalConstMem;
+    cdprop.multiProcessorCount = prop->multiProcessorCount;
+    cdprop.l2CacheSize = prop->l2CacheSize;
+    cdprop.maxThreadsPerMultiProcessor = prop->maxThreadsPerMultiProcessor;
+    cdprop.computeMode = prop->computeMode;
+    cdprop.canMapHostMemory = prop->canMapHostMemory;
+    cdprop.memoryClockRate = prop->memoryClockRate;
+    cdprop.memoryBusWidth = prop->memoryBusWidth;
+    return hipCUDAErrorTohipError(cudaChooseDevice(device, &cdprop));  // device is validated by the CUDA runtime
+}
+
+inline static hipError_t hipMemcpyHtoD(hipDeviceptr_t dst, void* src, size_t size) {  // driver-API cuMemcpyHtoD
+    return hipCUResultTohipError(cuMemcpyHtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoH(void* dst, hipDeviceptr_t src, size_t size) {  // driver-API cuMemcpyDtoH
+    return hipCUResultTohipError(cuMemcpyDtoH(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyDtoD(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size) {  // driver-API cuMemcpyDtoD
+    return hipCUResultTohipError(cuMemcpyDtoD(dst, src, size));
+}
+
+inline static hipError_t hipMemcpyHtoDAsync(hipDeviceptr_t dst, void* src, size_t size,
+                                            hipStream_t stream) {  // driver-API cuMemcpyHtoDAsync
+    return hipCUResultTohipError(cuMemcpyHtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoHAsync(void* dst, hipDeviceptr_t src, size_t size,
+                                            hipStream_t stream) {  // driver-API cuMemcpyDtoHAsync
+    return hipCUResultTohipError(cuMemcpyDtoHAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t src, size_t size,
+                                            hipStream_t stream) {  // driver-API cuMemcpyDtoDAsync
+    return hipCUResultTohipError(cuMemcpyDtoDAsync(dst, src, size, stream));
+}
+
+inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
+                                   hipMemcpyKind copyKind) {  // forwards to cudaMemcpy
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(copyKind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpy(dst, src, sizeBytes, cudaKind));
+}
+
+
+// Issues cudaMemcpyAsync on `stream`; if that succeeds, calls
+// cudaStreamSynchronize(stream) and reports that call's status instead.
+inline static hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+                                             hipMemcpyKind copyKind, hipStream_t stream) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(copyKind);
+    cudaError_t status = cudaMemcpyAsync(dst, src, sizeBytes, cudaKind, stream);
+    if (status == cudaSuccess) {
+        status = cudaStreamSynchronize(stream);
+    }
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
+                                        hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(copyKind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyAsync(dst, src, sizeBytes, cudaKind, stream));
+}
+
+inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
+                                           size_t offset __dparm(0),
+                                           hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(copyType);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, cudaKind));
+}
+
+inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
+                                                size_t sizeBytes, size_t offset,
+                                                hipMemcpyKind copyType,
+                                                hipStream_t stream __dparm(0)) {
+    const enum cudaMemcpyKind kind = hipMemcpyKindToCudaMemcpyKind(copyType);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(symbol, src, sizeBytes, offset, kind, stream));
+}
+
+inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
+                                             size_t offset __dparm(0),
+                                             hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, cudaKind));
+}
+
+inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
+                                                  size_t sizeBytes, size_t offset,
+                                                  hipMemcpyKind kind,
+                                                  hipStream_t stream __dparm(0)) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(dst, symbolName, sizeBytes, offset, cudaKind, stream));
+}
+
+inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {  // forwards to cudaGetSymbolAddress
+    return hipCUDAErrorTohipError(cudaGetSymbolAddress(devPtr, symbolName));
+}
+
+inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName) {  // forwards to cudaGetSymbolSize
+    return hipCUDAErrorTohipError(cudaGetSymbolSize(size, symbolName));
+}
+
+inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                     size_t width, size_t height, hipMemcpyKind kind) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpy2D(dst, dpitch, src, spitch, width, height, cudaKind));
+}
+
+inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {  // driver-API cuMemcpy2D
+  return hipCUResultTohipError(cuMemcpy2D(pCopy));
+}
+
+inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {
+  return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));  // driver-API cuMemcpy2DAsync
+}
+
+inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) {  // p is handed to cudaMemcpy3D as-is
+    return hipCUDAErrorTohipError(cudaMemcpy3D(p));
+}
+
+inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));  // p is handed to cudaMemcpy3DAsync as-is
+}
+
+inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {  // driver-API cuMemcpy3D
+    return hipCUResultTohipError(cuMemcpy3D(pCopy));
+}
+
+inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {  // driver-API cuMemcpy3DAsync
+    return hipCUResultTohipError(cuMemcpy3DAsync(pCopy, stream));
+}
+
+inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
+                                          size_t width, size_t height, hipMemcpyKind kind,
+                                          hipStream_t stream) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaKind, stream));
+}
+
+inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray* src,
+                                              size_t wOffset, size_t hOffset, size_t width,
+                                              size_t height, hipMemcpyKind kind) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    cudaError_t status = cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width, height, cudaKind);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray* src,
+                                                   size_t wOffset, size_t hOffset, size_t width,
+                                                   size_t height, hipMemcpyKind kind,
+                                                   hipStream_t stream) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    cudaError_t status = cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset, width,
+                                                    height, cudaKind, stream);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
+                                            const void* src, size_t spitch, size_t width,
+                                            size_t height, hipMemcpyKind kind) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width, height, cudaKind));
+}
+
+inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset,
+                                                 const void* src, size_t spitch, size_t width,
+                                                 size_t height, hipMemcpyKind kind,
+                                                 hipStream_t stream) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    cudaError_t status = cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch, width,
+                                                  height, cudaKind, stream);
+    return hipCUDAErrorTohipError(status);
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
+                                                           size_t hOffset, const void* src,
+                                                           size_t count, hipMemcpyKind kind) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyToArray(dst, wOffset, hOffset, src, count, cudaKind));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
+                                                             size_t wOffset, size_t hOffset,
+                                                             size_t count, hipMemcpyKind kind) {
+    const enum cudaMemcpyKind cudaKind = hipMemcpyKindToCudaMemcpyKind(kind);  // HIP -> CUDA direction
+    return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count, cudaKind));
+}
+
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
+                                       size_t count) {  // driver-API cuMemcpyAtoH
+    return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));  // hipArray* is cast to CUarray
+}
+
+inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost,
+                                       size_t count) {  // driver-API cuMemcpyHtoA
+    return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));  // hipArray* is cast to CUarray
+}
+
+inline static hipError_t hipDeviceSynchronize() {  // forwards to cudaDeviceSynchronize
+    return hipCUDAErrorTohipError(cudaDeviceSynchronize());
+}
+
+inline static hipError_t hipDeviceGetCacheConfig(hipFuncCache_t* pCacheConfig) {  // forwards to cudaDeviceGetCacheConfig
+    return hipCUDAErrorTohipError(cudaDeviceGetCacheConfig(pCacheConfig));
+}
+
+inline static hipError_t hipFuncSetAttribute(const void* func, hipFuncAttribute attr, int value) {
+    return hipCUDAErrorTohipError(cudaFuncSetAttribute(func, attr, value));  // attr is passed through without translation
+}
+
+inline static hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) {  // forwards to cudaDeviceSetCacheConfig
+    return hipCUDAErrorTohipError(cudaDeviceSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipFuncSetSharedMemConfig(const void* func, hipSharedMemConfig config) {
+    return hipCUDAErrorTohipError(cudaFuncSetSharedMemConfig(func, config));  // config is passed through without translation
+}
+
+inline static const char* hipGetErrorString(hipError_t error) {  // string comes from cudaGetErrorString
+    return cudaGetErrorString(hipErrorToCudaError(error));  // looked up by the translated CUDA code
+}
+
+inline static const char* hipGetErrorName(hipError_t error) {  // name comes from cudaGetErrorName
+    return cudaGetErrorName(hipErrorToCudaError(error));  // looked up by the translated CUDA code
+}
+
+inline static hipError_t hipGetDeviceCount(int* count) {  // forwards to cudaGetDeviceCount
+    return hipCUDAErrorTohipError(cudaGetDeviceCount(count));
+}
+
+inline static hipError_t hipGetDevice(int* device) {  // forwards to cudaGetDevice
+    return hipCUDAErrorTohipError(cudaGetDevice(device));
+}
+
+inline static hipError_t hipIpcCloseMemHandle(void* devPtr) {  // forwards to cudaIpcCloseMemHandle
+    return hipCUDAErrorTohipError(cudaIpcCloseMemHandle(devPtr));
+}
+
+inline static hipError_t hipIpcGetEventHandle(hipIpcEventHandle_t* handle, hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaIpcGetEventHandle(handle, event));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipIpcGetMemHandle(hipIpcMemHandle_t* handle, void* devPtr) {
+    return hipCUDAErrorTohipError(cudaIpcGetMemHandle(handle, devPtr));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipIpcOpenEventHandle(hipEvent_t* event, hipIpcEventHandle_t handle) {
+    return hipCUDAErrorTohipError(cudaIpcOpenEventHandle(event, handle));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle,
+                                             unsigned int flags) {  // forwards to cudaIpcOpenMemHandle
+    return hipCUDAErrorTohipError(cudaIpcOpenMemHandle(devPtr, handle, flags));
+}
+
+inline static hipError_t hipMemset(void* devPtr, int value, size_t count) {  // forwards to cudaMemset
+    return hipCUDAErrorTohipError(cudaMemset(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetD32(hipDeviceptr_t devPtr, int value, size_t count) {  // driver-API cuMemsetD32
+    return hipCUResultTohipError(cuMemsetD32(devPtr, value, count));
+}
+
+inline static hipError_t hipMemsetAsync(void* devPtr, int value, size_t count,
+                                        hipStream_t stream __dparm(0)) {  // forwards to cudaMemsetAsync
+    return hipCUDAErrorTohipError(cudaMemsetAsync(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD32Async(hipDeviceptr_t devPtr, int value, size_t count,
+                                           hipStream_t stream __dparm(0)) {  // driver-API cuMemsetD32Async
+    return hipCUResultTohipError(cuMemsetD32Async(devPtr, value, count, stream));
+}
+
+inline static hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes) {  // driver-API cuMemsetD8
+    return hipCUResultTohipError(cuMemsetD8(dest, value, sizeBytes));
+}
+
+inline static hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes,
+                                          hipStream_t stream __dparm(0)) {  // driver-API cuMemsetD8Async
+    return hipCUResultTohipError(cuMemsetD8Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes) {  // driver-API cuMemsetD16
+    return hipCUResultTohipError(cuMemsetD16(dest, value, sizeBytes));  // sizeBytes is forwarded unchanged as the third argument
+}
+
+inline static hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes,
+                                           hipStream_t stream __dparm(0)) {  // driver-API cuMemsetD16Async
+    return hipCUResultTohipError(cuMemsetD16Async(dest, value, sizeBytes, stream));
+}
+
+inline static hipError_t hipMemset2D(void* dst, size_t pitch, int value, size_t width, size_t height) {
+    return hipCUDAErrorTohipError(cudaMemset2D(dst, pitch, value, width, height));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, size_t height, hipStream_t stream __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaMemset2DAsync(dst, pitch, value, width, height, stream));  // arguments passed through unchanged
+}
+
+inline static hipError_t hipMemset3D(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent ){  // forwards to cudaMemset3D
+    return hipCUDAErrorTohipError(cudaMemset3D(pitchedDevPtr, value, extent));
+}
+
+inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int  value, hipExtent extent, hipStream_t stream __dparm(0) ){
+    return hipCUDAErrorTohipError(cudaMemset3DAsync(pitchedDevPtr, value, extent, stream));  // forwards to cudaMemset3DAsync
+}
+
+inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {  // Fills *p_prop from cudaGetDeviceProperties(device); returns the converted CUDA status.
+    struct cudaDeviceProp cdprop;
+    hipError_t status = hipCUDAErrorTohipError(cudaGetDeviceProperties(&cdprop, device));
+    if (status != hipSuccess) { return status; }  // cdprop may be unset on failure: leave *p_prop untouched
+    strncpy(p_prop->name, cdprop.name, 255);  // bounded copy; the terminator is written explicitly on the next line
+    p_prop->name[255] = '\0';
+    p_prop->totalGlobalMem = cdprop.totalGlobalMem;
+    p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock;
+    p_prop->regsPerBlock = cdprop.regsPerBlock;
+    p_prop->warpSize = cdprop.warpSize;
+    p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock;
+    for (int i = 0; i < 3; i++) {
+        p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i];
+        p_prop->maxGridSize[i] = cdprop.maxGridSize[i];
+    }
+    p_prop->clockRate = cdprop.clockRate;
+    p_prop->memoryClockRate = cdprop.memoryClockRate;
+    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
+    p_prop->totalConstMem = cdprop.totalConstMem;
+    p_prop->major = cdprop.major;
+    p_prop->minor = cdprop.minor;
+    p_prop->multiProcessorCount = cdprop.multiProcessorCount;
+    p_prop->l2CacheSize = cdprop.l2CacheSize;
+    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
+    p_prop->computeMode = cdprop.computeMode;
+    p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate:
+
+    int ccVers = p_prop->major * 100 + p_prop->minor * 10;  // e.g. compute capability 3.5 -> 350
+    p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110);
+    p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110);
+    p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120);
+    p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120);
+    p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200);
+    p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120);
+    p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110);  // NOTE(review): lower threshold than the global 64-bit case - confirm against the CUDA feature table
+    p_prop->arch.hasDoubles = (ccVers >= 130);
+    p_prop->arch.hasWarpVote = (ccVers >= 120);
+    p_prop->arch.hasWarpBallot = (ccVers >= 200);
+    p_prop->arch.hasWarpShuffle = (ccVers >= 300);
+    p_prop->arch.hasFunnelShift = (ccVers >= 350);
+    p_prop->arch.hasThreadFenceSystem = (ccVers >= 200);
+    p_prop->arch.hasSyncThreadsExt = (ccVers >= 200);
+    p_prop->arch.hasSurfaceFuncs = (ccVers >= 200);
+    p_prop->arch.has3dGrid = (ccVers >= 200);
+    p_prop->arch.hasDynamicParallelism = (ccVers >= 350);
+
+    p_prop->concurrentKernels = cdprop.concurrentKernels;
+    p_prop->pciDomainID = cdprop.pciDomainID;
+    p_prop->pciBusID = cdprop.pciBusID;
+    p_prop->pciDeviceID = cdprop.pciDeviceID;
+    p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
+    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
+    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
+    p_prop->gcnArch = 0; // Not a GCN arch
+    p_prop->integrated = cdprop.integrated;
+    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
+    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
+    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;  // the four "Unmatched" fields are hard-coded to 0 here
+    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
+    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
+
+    p_prop->maxTexture1D    = cdprop.maxTexture1D;
+    p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
+    p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
+    p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
+    p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
+    p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
+
+    p_prop->memPitch                 = cdprop.memPitch;
+    p_prop->textureAlignment         = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
+    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
+    p_prop->ECCEnabled               = cdprop.ECCEnabled;
+    p_prop->tccDriver                = cdprop.tccDriver;
+
+    return status;
+}
+
+inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {  // Translates attr to its cudaDevAttr* counterpart, then queries it into *pi.
+    enum cudaDeviceAttr cdattr;
+    cudaError_t cerror;
+
+    switch (attr) {  // one case per HIP attribute that has a mapping here
+        case hipDeviceAttributeMaxThreadsPerBlock:
+            cdattr = cudaDevAttrMaxThreadsPerBlock;
+            break;
+        case hipDeviceAttributeMaxBlockDimX:
+            cdattr = cudaDevAttrMaxBlockDimX;
+            break;
+        case hipDeviceAttributeMaxBlockDimY:
+            cdattr = cudaDevAttrMaxBlockDimY;
+            break;
+        case hipDeviceAttributeMaxBlockDimZ:
+            cdattr = cudaDevAttrMaxBlockDimZ;
+            break;
+        case hipDeviceAttributeMaxGridDimX:
+            cdattr = cudaDevAttrMaxGridDimX;
+            break;
+        case hipDeviceAttributeMaxGridDimY:
+            cdattr = cudaDevAttrMaxGridDimY;
+            break;
+        case hipDeviceAttributeMaxGridDimZ:
+            cdattr = cudaDevAttrMaxGridDimZ;
+            break;
+        case hipDeviceAttributeMaxSharedMemoryPerBlock:
+            cdattr = cudaDevAttrMaxSharedMemoryPerBlock;
+            break;
+        case hipDeviceAttributeTotalConstantMemory:
+            cdattr = cudaDevAttrTotalConstantMemory;
+            break;
+        case hipDeviceAttributeWarpSize:
+            cdattr = cudaDevAttrWarpSize;
+            break;
+        case hipDeviceAttributeMaxRegistersPerBlock:
+            cdattr = cudaDevAttrMaxRegistersPerBlock;
+            break;
+        case hipDeviceAttributeClockRate:
+            cdattr = cudaDevAttrClockRate;
+            break;
+        case hipDeviceAttributeMemoryClockRate:
+            cdattr = cudaDevAttrMemoryClockRate;
+            break;
+        case hipDeviceAttributeMemoryBusWidth:
+            cdattr = cudaDevAttrGlobalMemoryBusWidth;  // CUDA spells this attribute with a "Global" prefix
+            break;
+        case hipDeviceAttributeMultiprocessorCount:
+            cdattr = cudaDevAttrMultiProcessorCount;
+            break;
+        case hipDeviceAttributeComputeMode:
+            cdattr = cudaDevAttrComputeMode;
+            break;
+        case hipDeviceAttributeL2CacheSize:
+            cdattr = cudaDevAttrL2CacheSize;
+            break;
+        case hipDeviceAttributeMaxThreadsPerMultiProcessor:
+            cdattr = cudaDevAttrMaxThreadsPerMultiProcessor;
+            break;
+        case hipDeviceAttributeComputeCapabilityMajor:
+            cdattr = cudaDevAttrComputeCapabilityMajor;
+            break;
+        case hipDeviceAttributeComputeCapabilityMinor:
+            cdattr = cudaDevAttrComputeCapabilityMinor;
+            break;
+        case hipDeviceAttributeConcurrentKernels:
+            cdattr = cudaDevAttrConcurrentKernels;
+            break;
+        case hipDeviceAttributePciBusId:
+            cdattr = cudaDevAttrPciBusId;
+            break;
+        case hipDeviceAttributePciDeviceId:
+            cdattr = cudaDevAttrPciDeviceId;
+            break;
+        case hipDeviceAttributeMaxSharedMemoryPerMultiprocessor:
+            cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor;
+            break;
+        case hipDeviceAttributeIsMultiGpuBoard:
+            cdattr = cudaDevAttrIsMultiGpuBoard;
+            break;
+        case hipDeviceAttributeIntegrated:
+            cdattr = cudaDevAttrIntegrated;
+            break;
+        case hipDeviceAttributeMaxTexture1DWidth:
+            cdattr = cudaDevAttrMaxTexture1DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture2DWidth:
+            cdattr = cudaDevAttrMaxTexture2DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture2DHeight:
+            cdattr = cudaDevAttrMaxTexture2DHeight;
+            break;
+        case hipDeviceAttributeMaxTexture3DWidth:
+            cdattr = cudaDevAttrMaxTexture3DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture3DHeight:
+            cdattr = cudaDevAttrMaxTexture3DHeight;
+            break;
+        case hipDeviceAttributeMaxTexture3DDepth:
+            cdattr = cudaDevAttrMaxTexture3DDepth;
+            break;
+        case hipDeviceAttributeMaxPitch:
+            cdattr = cudaDevAttrMaxPitch;
+            break;
+        case hipDeviceAttributeTextureAlignment:
+            cdattr = cudaDevAttrTextureAlignment;
+            break;
+        case hipDeviceAttributeTexturePitchAlignment:
+            cdattr = cudaDevAttrTexturePitchAlignment;
+            break;
+        case hipDeviceAttributeKernelExecTimeout:
+            cdattr = cudaDevAttrKernelExecTimeout;
+            break;
+        case hipDeviceAttributeCanMapHostMemory:
+            cdattr = cudaDevAttrCanMapHostMemory;
+            break;
+        case hipDeviceAttributeEccEnabled:
+            cdattr = cudaDevAttrEccEnabled;
+            break;
+        case hipDeviceAttributeCooperativeLaunch:
+            cdattr = cudaDevAttrCooperativeLaunch;
+            break;
+        case hipDeviceAttributeCooperativeMultiDeviceLaunch:
+            cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
+            break;
+        case hipDeviceAttributeConcurrentManagedAccess:
+            cdattr = cudaDevAttrConcurrentManagedAccess;
+            break;
+        case hipDeviceAttributeManagedMemory:
+            cdattr = cudaDevAttrManagedMemory;
+            break;
+        case hipDeviceAttributePageableMemoryAccessUsesHostPageTables:
+            cdattr = cudaDevAttrPageableMemoryAccessUsesHostPageTables;
+            break;
+        case hipDeviceAttributePageableMemoryAccess:
+            cdattr = cudaDevAttrPageableMemoryAccess;
+            break;
+        case hipDeviceAttributeDirectManagedMemAccessFromHost:
+            cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
+            break;
+        default:
+            return hipCUDAErrorTohipError(cudaErrorInvalidValue);  // unmapped attribute: fail without calling CUDA or writing *pi
+    }
+
+    cerror = cudaDeviceGetAttribute(pi, cdattr, device);  // cdattr is always assigned here; the default case returned above
+
+    return hipCUDAErrorTohipError(cerror);
+}
+
+// Occupancy query taking a const void* kernel pointer: forwards to
+// cudaOccupancyMaxActiveBlocksPerMultiprocessor and converts the CUDA status to hipError_t.
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize) {
+    cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Flag-taking variant of the occupancy query for a const void* kernel pointer.
+// Forwards to cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; the CUDA
+// status is converted with hipCUDAErrorTohipError.
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, const void* func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSMemSize, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Occupancy query for a hipFunction_t handle: forwards to
+// cuOccupancyMaxActiveBlocksPerMultiprocessor and converts the CUresult to hipError_t.
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize) {
+    CUresult status = cuOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynamicSMemSize);
+    return hipCUResultTohipError(status);
+}
+
+// Flag-taking occupancy query for a hipFunction_t handle.
+// Forwards to cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags; the
+// CUresult is converted with hipCUResultTohipError.
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    CUresult status = cuOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, f, blockSize, dynamicSMemSize, flags);
+    return hipCUResultTohipError(status);
+}
+
+// Forwards to cuOccupancyMaxPotentialBlockSize, always passing NULL as its fourth argument.
+// TODO - Match CUoccupancyB2DSize
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSize(
+    int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit) {
+    CUresult status = cuOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, NULL, dynSharedMemPerBlk, blockSizeLimit);
+    return hipCUResultTohipError(status);
+}
+
+// Forwards to cuOccupancyMaxPotentialBlockSizeWithFlags, always passing NULL as its fourth argument.
+// TODO - Match CUoccupancyB2DSize
+inline static hipError_t hipModuleOccupancyMaxPotentialBlockSizeWithFlags(
+    int* gridSize, int* blockSize, hipFunction_t f, size_t dynSharedMemPerBlk, int blockSizeLimit, unsigned int flags) {
+    CUresult status = cuOccupancyMaxPotentialBlockSizeWithFlags(gridSize, blockSize, f, NULL, dynSharedMemPerBlk, blockSizeLimit, flags);
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attributes, const void* ptr) {  // Translates cudaPointerGetAttributes output into *attributes.
+    struct cudaPointerAttributes cPA;
+    hipError_t err = hipCUDAErrorTohipError(cudaPointerGetAttributes(&cPA, ptr));
+    if (err == hipSuccess) {  // *attributes is only written when the CUDA query succeeded
+#if (CUDART_VERSION >= 11000)
+        enum cudaMemoryType memType = cPA.type;  // explicit type instead of `auto`, so this extern "C" section also parses as C
+#else
+        unsigned memType = cPA.memoryType; // No auto because cuda 10.2 doesnt force c++11
+#endif
+        switch (memType) {
+            case cudaMemoryTypeDevice:
+                attributes->memoryType = hipMemoryTypeDevice;
+                break;
+            case cudaMemoryTypeHost:
+                attributes->memoryType = hipMemoryTypeHost;
+                break;
+            default:
+                return hipErrorUnknown;  // any other CUDA memory type is reported as an error; the remaining fields are not written
+        }
+        attributes->device = cPA.device;
+        attributes->devicePointer = cPA.devicePointer;
+        attributes->hostPointer = cPA.hostPointer;
+        attributes->isManaged = 0;  // hard-coded, not derived from cPA
+        attributes->allocationFlags = 0;
+    }
+    return err;
+}
+
+inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {  // Wraps cudaMemGetInfo; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaMemGetInfo(free, total));
+}
+
+inline static hipError_t hipEventCreate(hipEvent_t* event) {  // Wraps cudaEventCreate; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaEventCreate(event));
+}
+
+inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __dparm(NULL)) {  // Wraps cudaEventRecord; __dparm(NULL) presumably defaults the stream to NULL - confirm against the macro.
+    return hipCUDAErrorTohipError(cudaEventRecord(event, stream));
+}
+
+inline static hipError_t hipEventSynchronize(hipEvent_t event) {  // Wraps cudaEventSynchronize; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaEventSynchronize(event));
+}
+
+inline static hipError_t hipEventElapsedTime(float* ms, hipEvent_t start, hipEvent_t stop) {  // Wraps cudaEventElapsedTime; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaEventElapsedTime(ms, start, stop));
+}
+
+inline static hipError_t hipEventDestroy(hipEvent_t event) {  // Wraps cudaEventDestroy; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaEventDestroy(event));
+}
+
+inline static hipError_t hipStreamCreateWithFlags(hipStream_t* stream, unsigned int flags) {  // Wraps cudaStreamCreateWithFlags; flags are passed through untranslated.
+    return hipCUDAErrorTohipError(cudaStreamCreateWithFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamCreateWithPriority(hipStream_t* stream, unsigned int flags, int priority) {  // Wraps cudaStreamCreateWithPriority; flags and priority pass through untranslated.
+    return hipCUDAErrorTohipError(cudaStreamCreateWithPriority(stream, flags, priority));
+}
+
+inline static hipError_t hipDeviceGetStreamPriorityRange(int* leastPriority, int* greatestPriority) {  // Wraps cudaDeviceGetStreamPriorityRange; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDeviceGetStreamPriorityRange(leastPriority, greatestPriority));
+}
+
+inline static hipError_t hipStreamCreate(hipStream_t* stream) {  // Wraps cudaStreamCreate; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamCreate(stream));
+}
+
+inline static hipError_t hipStreamSynchronize(hipStream_t stream) {  // Wraps cudaStreamSynchronize; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
+}
+
+inline static hipError_t hipStreamDestroy(hipStream_t stream) {  // Wraps cudaStreamDestroy; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamDestroy(stream));
+}
+
+inline static hipError_t hipStreamGetFlags(hipStream_t stream, unsigned int *flags) {  // Wraps cudaStreamGetFlags; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamGetFlags(stream, flags));
+}
+
+inline static hipError_t hipStreamGetPriority(hipStream_t stream, int *priority) {  // Wraps cudaStreamGetPriority; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamGetPriority(stream, priority));
+}
+
+inline static hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int flags) {
+    cudaError_t status = cudaStreamWaitEvent(stream, event, flags);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipStreamQuery(hipStream_t stream) {  // Wraps cudaStreamQuery; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaStreamQuery(stream));
+}
+
+// Forwards to cudaStreamAddCallback; the HIP callback pointer is cast to cudaStreamCallback_t.
+inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback, void* userData, unsigned int flags) {
+    cudaError_t status = cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipDriverGetVersion(int* driverVersion) {  // Calls cudaDriverGetVersion, then reports a fixed version of 4.
+    cudaError_t err = cudaDriverGetVersion(driverVersion);
+
+    // Override driver version to match version reported on HCC side.
+    // A NULL output pointer is left to the CUDA status instead of being dereferenced here.
+    if (driverVersion != NULL) { *driverVersion = 4; }
+    return hipCUDAErrorTohipError(err);
+}
+
+inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) {  // Wraps cudaRuntimeGetVersion; the value is not overridden here.
+    return hipCUDAErrorTohipError(cudaRuntimeGetVersion(runtimeVersion));
+}
+
+inline static hipError_t hipDeviceCanAccessPeer(int* canAccessPeer, int device, int peerDevice) {  // Wraps cudaDeviceCanAccessPeer; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDeviceCanAccessPeer(canAccessPeer, device, peerDevice));
+}
+
+inline static hipError_t hipDeviceDisablePeerAccess(int peerDevice) {  // Wraps cudaDeviceDisablePeerAccess; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDeviceDisablePeerAccess(peerDevice));
+}
+
+inline static hipError_t hipDeviceEnablePeerAccess(int peerDevice, unsigned int flags) {  // Wraps cudaDeviceEnablePeerAccess; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDeviceEnablePeerAccess(peerDevice, flags));
+}
+
+inline static hipError_t hipCtxDisablePeerAccess(hipCtx_t peerCtx) {  // Wraps cuCtxDisablePeerAccess; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxDisablePeerAccess(peerCtx));
+}
+
+inline static hipError_t hipCtxEnablePeerAccess(hipCtx_t peerCtx, unsigned int flags) {  // Wraps cuCtxEnablePeerAccess; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxEnablePeerAccess(peerCtx, flags));
+}
+
+inline static hipError_t hipDevicePrimaryCtxGetState(hipDevice_t dev, unsigned int* flags, int* active) {
+    CUresult status = cuDevicePrimaryCtxGetState(dev, flags, active);  // arguments forwarded unchanged
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipDevicePrimaryCtxRelease(hipDevice_t dev) {  // Wraps cuDevicePrimaryCtxRelease; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDevicePrimaryCtxRelease(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxRetain(hipCtx_t* pctx, hipDevice_t dev) {  // Wraps cuDevicePrimaryCtxRetain; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDevicePrimaryCtxRetain(pctx, dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxReset(hipDevice_t dev) {  // Wraps cuDevicePrimaryCtxReset; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDevicePrimaryCtxReset(dev));
+}
+
+inline static hipError_t hipDevicePrimaryCtxSetFlags(hipDevice_t dev, unsigned int flags) {  // Wraps cuDevicePrimaryCtxSetFlags; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDevicePrimaryCtxSetFlags(dev, flags));
+}
+
+inline static hipError_t hipMemGetAddressRange(hipDeviceptr_t* pbase, size_t* psize, hipDeviceptr_t dptr) {
+    CUresult status = cuMemGetAddressRange(pbase, psize, dptr);  // arguments forwarded unchanged
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipMemcpyPeer(void* dst, int dstDevice, const void* src, int srcDevice, size_t count) {
+    cudaError_t status = cudaMemcpyPeer(dst, dstDevice, src, srcDevice, count);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaMemcpyPeerAsync with every argument unchanged; the CUDA status is converted to hipError_t.
+inline static hipError_t hipMemcpyPeerAsync(void* dst, int dstDevice, const void* src, int srcDevice,
+                                            size_t count, hipStream_t stream __dparm(0)) {
+    cudaError_t status = cudaMemcpyPeerAsync(dst, dstDevice, src, srcDevice, count, stream);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Profile APIs:
+inline static hipError_t hipProfilerStart(void) { return hipCUDAErrorTohipError(cudaProfilerStart()); }  // Wraps cudaProfilerStart; (void) is a real prototype in C.
+
+inline static hipError_t hipProfilerStop(void) { return hipCUDAErrorTohipError(cudaProfilerStop()); }  // Wraps cudaProfilerStop; (void) is a real prototype in C.
+
+inline static hipError_t hipGetDeviceFlags(unsigned int* flags) {  // Wraps cudaGetDeviceFlags; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaGetDeviceFlags(flags));
+}
+
+inline static hipError_t hipSetDeviceFlags(unsigned int flags) {  // Wraps cudaSetDeviceFlags; flags are passed through untranslated.
+    return hipCUDAErrorTohipError(cudaSetDeviceFlags(flags));
+}
+
+inline static hipError_t hipEventCreateWithFlags(hipEvent_t* event, unsigned int flags) {  // Wraps cudaEventCreateWithFlags; flags are passed through untranslated.
+    return hipCUDAErrorTohipError(cudaEventCreateWithFlags(event, flags));
+}
+
+inline static hipError_t hipEventQuery(hipEvent_t event) {  // Wraps cudaEventQuery; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaEventQuery(event));
+}
+
+inline static hipError_t hipCtxCreate(hipCtx_t* ctx, unsigned int flags, hipDevice_t device) {  // Wraps cuCtxCreate; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxCreate(ctx, flags, device));
+}
+
+inline static hipError_t hipCtxDestroy(hipCtx_t ctx) {  // Wraps cuCtxDestroy; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxDestroy(ctx));
+}
+
+inline static hipError_t hipCtxPopCurrent(hipCtx_t* ctx) {  // Wraps cuCtxPopCurrent; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxPopCurrent(ctx));
+}
+
+inline static hipError_t hipCtxPushCurrent(hipCtx_t ctx) {  // Wraps cuCtxPushCurrent; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxPushCurrent(ctx));
+}
+
+inline static hipError_t hipCtxSetCurrent(hipCtx_t ctx) {  // Wraps cuCtxSetCurrent; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxSetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetCurrent(hipCtx_t* ctx) {  // Wraps cuCtxGetCurrent; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxGetCurrent(ctx));
+}
+
+inline static hipError_t hipCtxGetDevice(hipDevice_t* device) {  // Wraps cuCtxGetDevice; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxGetDevice(device));
+}
+
+inline static hipError_t hipCtxGetApiVersion(hipCtx_t ctx, int* apiVersion) {  // Wraps cuCtxGetApiVersion; the int* is cast to unsigned int* for the call.
+    return hipCUResultTohipError(cuCtxGetApiVersion(ctx, (unsigned int*)apiVersion));
+}
+
+inline static hipError_t hipCtxGetCacheConfig(hipFuncCache* cacheConfig) {  // Wraps cuCtxGetCacheConfig; the pointer is passed without a cast.
+    return hipCUResultTohipError(cuCtxGetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetCacheConfig(hipFuncCache cacheConfig) {  // Wraps cuCtxSetCacheConfig; the value is passed without a cast.
+    return hipCUResultTohipError(cuCtxSetCacheConfig(cacheConfig));
+}
+
+inline static hipError_t hipCtxSetSharedMemConfig(hipSharedMemConfig config) {  // Wraps cuCtxSetSharedMemConfig; config is cast to CUsharedconfig.
+    return hipCUResultTohipError(cuCtxSetSharedMemConfig((CUsharedconfig)config));
+}
+
+inline static hipError_t hipCtxGetSharedMemConfig(hipSharedMemConfig* pConfig) {  // Wraps cuCtxGetSharedMemConfig; pConfig is cast to CUsharedconfig*.
+    return hipCUResultTohipError(cuCtxGetSharedMemConfig((CUsharedconfig*)pConfig));
+}
+
+inline static hipError_t hipCtxSynchronize(void) {  // Wraps cuCtxSynchronize; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxSynchronize());
+}
+
+inline static hipError_t hipCtxGetFlags(unsigned int* flags) {  // Wraps cuCtxGetFlags; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxGetFlags(flags));
+}
+
+inline static hipError_t hipCtxDetach(hipCtx_t ctx) {  // Wraps cuCtxDetach; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuCtxDetach(ctx));
+}
+
+inline static hipError_t hipDeviceGet(hipDevice_t* device, int ordinal) {  // Wraps cuDeviceGet; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDeviceGet(device, ordinal));
+}
+
+inline static hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device) {  // Wraps cuDeviceComputeCapability; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDeviceComputeCapability(major, minor, device));
+}
+
+inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device) {  // Wraps cuDeviceGetName; name and len are forwarded unchanged.
+    return hipCUResultTohipError(cuDeviceGetName(name, len, device));
+}
+
+inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr, int srcDevice, int dstDevice) {
+    cudaError_t status = cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice);  // attr is passed without translation
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipDeviceGetPCIBusId(char* pciBusId, int len, hipDevice_t device) {  // Wraps cudaDeviceGetPCIBusId; the hipDevice_t is handed to the runtime call as-is.
+    return hipCUDAErrorTohipError(cudaDeviceGetPCIBusId(pciBusId, len, device));
+}
+
+inline static hipError_t hipDeviceGetByPCIBusId(int* device, const char* pciBusId) {  // Wraps cudaDeviceGetByPCIBusId; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDeviceGetByPCIBusId(device, pciBusId));
+}
+
+inline static hipError_t hipDeviceGetSharedMemConfig(hipSharedMemConfig* config) {  // Wraps cudaDeviceGetSharedMemConfig; the pointer is passed without a cast.
+    return hipCUDAErrorTohipError(cudaDeviceGetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceSetSharedMemConfig(hipSharedMemConfig config) {  // Wraps cudaDeviceSetSharedMemConfig; the value is passed without a cast.
+    return hipCUDAErrorTohipError(cudaDeviceSetSharedMemConfig(config));
+}
+
+inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {  // Wraps cudaDeviceGetLimit; limit is passed without translation.
+    return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
+}
+
+inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {  // Wraps cuDeviceTotalMem; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
+}
+
+inline static hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {  // Wraps cuModuleLoad; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuModuleLoad(module, fname));
+}
+
+inline static hipError_t hipModuleUnload(hipModule_t hmod) {  // Wraps cuModuleUnload; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuModuleUnload(hmod));
+}
+
+inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule_t module, const char* kname) {
+    CUresult status = cuModuleGetFunction(function, module, kname);  // arguments forwarded unchanged
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name){  // Wraps cuModuleGetTexRef; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));  // the result must be returned: falling off the end of a non-void function is undefined
+}
+
+inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {  // Wraps cudaFuncGetAttributes; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
+}
+
+inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {  // Wraps cuFuncGetAttribute; attrib is passed without translation.
+    return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
+}
+
+inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes, hipModule_t hmod, const char* name) {
+    CUresult status = cuModuleGetGlobal(dptr, bytes, hmod, name);  // arguments forwarded unchanged
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {  // Wraps cuModuleLoadData; CUresult is converted to hipError_t.
+    return hipCUResultTohipError(cuModuleLoadData(module, image));
+}
+
+// Forwards to cuModuleLoadDataEx with all arguments unchanged; the CUresult is converted to hipError_t.
+inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
+                                             unsigned int numOptions, hipJitOption* options, void** optionValues) {
+    CUresult status = cuModuleLoadDataEx(module, image, numOptions, options, optionValues);
+    return hipCUResultTohipError(status);
+}
+
+// Forwards to cudaLaunchKernel with all arguments unchanged; the CUDA status is converted to hipError_t.
+inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks, dim3 dimBlocks,
+                                         void** args, size_t sharedMemBytes, hipStream_t stream) {
+    cudaError_t status = cudaLaunchKernel(function_address, numBlocks, dimBlocks, args, sharedMemBytes, stream);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Hands every argument to cuLaunchKernel in the same order; the CUresult is converted to hipError_t.
+inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
+                                               unsigned int gridDimY, unsigned int gridDimZ,
+                                               unsigned int blockDimX, unsigned int blockDimY,
+                                               unsigned int blockDimZ, unsigned int sharedMemBytes,
+                                               hipStream_t stream, void** kernelParams, void** extra) {
+    CUresult status = cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ,
+                                     sharedMemBytes, stream, kernelParams, extra);
+    return hipCUResultTohipError(status);
+}
+
+inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t cacheConfig) {  // Wraps cudaFuncSetCacheConfig; cacheConfig is passed without translation.
+    return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
+}
+
+// Forwards to cudaBindTexture with all arguments unchanged; the CUDA status is converted to hipError_t.
+__HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset, struct textureReference* tex,
+                                                         const void* devPtr, const hipChannelFormatDesc* desc,
+                                                         size_t size __dparm(UINT_MAX)) {
+    cudaError_t status = cudaBindTexture(offset, tex, devPtr, desc, size);
+    return hipCUDAErrorTohipError(status);
+}
+
+__HIP_DEPRECATED inline static hipError_t hipBindTexture2D(size_t* offset, struct textureReference* tex, const void* devPtr,
+                                                           const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
+    cudaError_t status = cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w, hipChannelFormatKind f) {
+    hipChannelFormatDesc desc = cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));  // the HIP kind is translated before the call
+    return desc;
+}
+
+// Forwards to cudaCreateTextureObject with the descriptor pointers unchanged;
+// the CUDA status is converted to hipError_t.
+inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject, const hipResourceDesc* pResDesc,
+                                                const hipTextureDesc* pTexDesc, const hipResourceViewDesc* pResViewDesc) {
+    cudaError_t status = cudaCreateTextureObject(pTexObject, pResDesc, pTexDesc, pResViewDesc);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipDestroyTextureObject(hipTextureObject_t textureObject) {  // Wraps cudaDestroyTextureObject; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDestroyTextureObject(textureObject));
+}
+
+inline static hipError_t hipCreateSurfaceObject(hipSurfaceObject_t* pSurfObject, const hipResourceDesc* pResDesc) {
+    cudaError_t status = cudaCreateSurfaceObject(pSurfObject, pResDesc);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipDestroySurfaceObject(hipSurfaceObject_t surfaceObject) {  // Wraps cudaDestroySurfaceObject; CUDA status is converted to hipError_t.
+    return hipCUDAErrorTohipError(cudaDestroySurfaceObject(surfaceObject));
+}
+
+inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDesc, hipTextureObject_t textureObject) {
+    cudaError_t status = cudaGetTextureObjectResourceDesc(pResDesc, textureObject);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+__HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(size_t* offset, const struct textureReference* texref) {
+    cudaError_t status = cudaGetTextureAlignmentOffset(offset, texref);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array) {
+    cudaError_t status = cudaGetChannelDesc(desc, array);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaLaunchCooperativeKernel with all arguments unchanged; the CUDA status is converted to hipError_t.
+inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, dim3 blockDim, void** kernelParams,
+                                                    unsigned int sharedMemBytes, hipStream_t stream) {
+    cudaError_t status = cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream);
+    return hipCUDAErrorTohipError(status);
+}
+
+inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList, int numDevices, unsigned int flags) {
+    cudaError_t status = cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags);  // arguments forwarded unchanged
+    return hipCUDAErrorTohipError(status);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#ifdef __CUDACC__
+
+// C++ overload accepting any kernel type T: forwards to
+// cudaOccupancyMaxActiveBlocksPerMultiprocessor and converts the CUDA status to hipError_t.
+template <class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, T func, int blockSize,
+                                                                      size_t dynamicSMemSize) {
+    cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, func, blockSize, dynamicSMemSize);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Forwards to cudaOccupancyMaxPotentialBlockSize; both trailing parameters default to 0.
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int* blockSize, T func,
+                                                           size_t dynamicSMemSize = 0, int blockSizeLimit = 0) {
+    cudaError_t status = cudaOccupancyMaxPotentialBlockSize(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit);
+    return hipCUDAErrorTohipError(status);
+}
+
+// Flag-taking block-size query. It has to call the *WithFlags CUDA entry point: the plain
+// cudaOccupancyMaxPotentialBlockSize takes no flags argument, so six arguments cannot bind to it.
+template <class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func, size_t dynamicSMemSize = 0,
+                                                                    int blockSizeLimit = 0, unsigned int flags = 0) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSizeWithFlags(minGridSize, blockSize, func, dynamicSMemSize, blockSizeLimit, flags));
+}
+
+// C++ overload accepting any kernel type T; forwards to cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags.
+template <class T>
+inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(int* numBlocks, T func, int blockSize, size_t dynamicSMemSize, unsigned int flags) {
+    cudaError_t status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, func, blockSize, dynamicSMemSize, flags);
+    return hipCUDAErrorTohipError(status);
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
+                                        const void* devPtr, size_t size = UINT_MAX) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, size));
+}
+
+// Forwards all arguments to the cudaBindTexture overload that takes a channel descriptor.
+template <class T, int dim, enum cudaTextureReadMode readMode>
+inline static hipError_t hipBindTexture(size_t* offset, struct texture<T, dim, readMode>& tex,
+        const void* devPtr, const hipChannelFormatDesc& desc, size_t size = UINT_MAX) {
+    return hipCUDAErrorTohipError(cudaBindTexture(offset, tex, devPtr, desc, size));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>* tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipUnbindTexture(struct texture<T, dim, readMode>& tex) {
+    return hipCUDAErrorTohipError(cudaUnbindTexture(tex));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array,
+    const hipChannelFormatDesc& desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>* tex, hipArray_const_t array,
+    const hipChannelFormatDesc* desc) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array, desc));
+}
+
+template <class T, int dim, enum cudaTextureReadMode readMode>
+__HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
+    struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
+    return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
+}
+
+template <class T>
+inline static hipChannelFormatDesc hipCreateChannelDesc() {
+    return cudaCreateChannelDesc<T>();
+}
+
+template <class T>
+inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim,
+                                             void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+            cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
+}
+
+inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
+    return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
+}
+
+inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){
+    return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
+}
+
+inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){
+   return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
+}
+
+inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, hipDeviceptr_t dptr, size_t Pitch){
+   return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
+}
+
+inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
+   return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
+}
+
+inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
+   return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
+}
+
+inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
+   return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
+}
+
+inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
+   return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+}
+
+inline static hipError_t hipArrayDestroy(hiparray hArray){
+   return hipCUResultTohipError(cuArrayDestroy(hArray));
+}
+
+inline static hipError_t hipArray3DCreate(hiparray* pHandle,
+                                          const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray){
+   return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+}
+
+#endif  //__CUDACC__
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_texture_types.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_texture_types.h
new file mode 100644
index 0000000000..df374d705a
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_texture_types.h
@@ -0,0 +1,6 @@
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_TEXTURE_TYPES_H
+
+#include <texture_types.h>
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
new file mode 100644
index 0000000000..449ba26c0f
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
@@ -0,0 +1,197 @@
+/*
+Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIPRTC_H
+#define HIPRTC_H
+
+#include <cuda.h>
+#include <nvrtc.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#if !defined(_WIN32)
+#pragma GCC visibility push(default)
+#endif
+
+/* Status codes returned by the hiprtc* wrappers in this header. */
+typedef enum hiprtcResult {
+  HIPRTC_SUCCESS = 0,
+  HIPRTC_ERROR_OUT_OF_MEMORY = 1,
+  HIPRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  HIPRTC_ERROR_INVALID_INPUT = 3,
+  HIPRTC_ERROR_INVALID_PROGRAM = 4,
+  HIPRTC_ERROR_INVALID_OPTION = 5,
+  HIPRTC_ERROR_COMPILATION = 6,
+  HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  HIPRTC_ERROR_INTERNAL_ERROR = 11
+} hiprtcResult;
+
+/**
+ * Translates a HIPRTC status code into the NVRTC enumerator of the same name.
+ * Any value that is not a named hiprtcResult enumerator maps to
+ * NVRTC_ERROR_INTERNAL_ERROR, so every control path returns a value.
+ */
+inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
+  switch (result) {
+    case HIPRTC_SUCCESS:
+      return NVRTC_SUCCESS;
+    case HIPRTC_ERROR_OUT_OF_MEMORY:
+      return NVRTC_ERROR_OUT_OF_MEMORY;
+    case HIPRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return NVRTC_ERROR_PROGRAM_CREATION_FAILURE;
+    case HIPRTC_ERROR_INVALID_INPUT:
+      return NVRTC_ERROR_INVALID_INPUT;
+    case HIPRTC_ERROR_INVALID_PROGRAM:
+      return NVRTC_ERROR_INVALID_PROGRAM;
+    case HIPRTC_ERROR_INVALID_OPTION:
+      return NVRTC_ERROR_INVALID_OPTION;
+    case HIPRTC_ERROR_COMPILATION:
+      return NVRTC_ERROR_COMPILATION;
+    case HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return NVRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+    case HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+    case HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      return NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+    case HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+    case HIPRTC_ERROR_INTERNAL_ERROR:
+      return NVRTC_ERROR_INTERNAL_ERROR;
+    default:
+      return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+}
+
+/**
+ * Translates an NVRTC status code into the HIPRTC enumerator of the same name.
+ * Any value without a case below (for example an enumerator introduced by a
+ * newer NVRTC release) maps to HIPRTC_ERROR_INTERNAL_ERROR.
+ */
+inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
+  switch (result) {
+    case NVRTC_SUCCESS:
+      return HIPRTC_SUCCESS;
+    case NVRTC_ERROR_OUT_OF_MEMORY:
+      return HIPRTC_ERROR_OUT_OF_MEMORY;
+    case NVRTC_ERROR_PROGRAM_CREATION_FAILURE:
+      return HIPRTC_ERROR_PROGRAM_CREATION_FAILURE;
+    case NVRTC_ERROR_INVALID_INPUT:
+      return HIPRTC_ERROR_INVALID_INPUT;
+    case NVRTC_ERROR_INVALID_PROGRAM:
+      return HIPRTC_ERROR_INVALID_PROGRAM;
+    case NVRTC_ERROR_INVALID_OPTION:
+      return HIPRTC_ERROR_INVALID_OPTION;
+    case NVRTC_ERROR_COMPILATION:
+      return HIPRTC_ERROR_COMPILATION;
+    case NVRTC_ERROR_BUILTIN_OPERATION_FAILURE:
+      return HIPRTC_ERROR_BUILTIN_OPERATION_FAILURE;
+    case NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION:
+      return HIPRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION;
+    case NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION:
+      return HIPRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION;
+    case NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID:
+      return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
+    case NVRTC_ERROR_INTERNAL_ERROR:
+      return HIPRTC_ERROR_INTERNAL_ERROR;
+    default:
+      return HIPRTC_ERROR_INTERNAL_ERROR;
+  }
+}
+
+/** Returns nvrtcGetErrorString() for the NVRTC equivalent of `result`. */
+inline static const char* hiprtcGetErrorString(hiprtcResult result) {
+  return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));
+}
+
+/** Forwards to nvrtcVersion and translates the returned status. */
+inline static hiprtcResult hiprtcVersion(int* major, int* minor) {
+  return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));
+}
+
+/* A hiprtc program handle is an alias for the NVRTC program handle. */
+typedef nvrtcProgram hiprtcProgram;
+
+/** Forwards to nvrtcAddNameExpression and translates the returned status. */
+inline static hiprtcResult hiprtcAddNameExpression(
+    hiprtcProgram prog, const char* name_expression) {
+  return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));
+}
+
+/** Forwards to nvrtcCompileProgram and translates the returned status. */
+inline static hiprtcResult hiprtcCompileProgram(
+    hiprtcProgram prog, int numOptions, const char** options) {
+  return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));
+}
+
+/** Forwards to nvrtcCreateProgram and translates the returned status. */
+inline static hiprtcResult hiprtcCreateProgram(
+    hiprtcProgram* prog, const char* src, const char* name, int numHeaders,
+    const char** headers, const char** includeNames) {
+  return nvrtcResultTohiprtcResult(
+      nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
+}
+
+/** Forwards to nvrtcDestroyProgram and translates the returned status. */
+inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
+  return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));
+}
+
+/** Forwards to nvrtcGetLoweredName and translates the returned status. */
+inline static hiprtcResult hiprtcGetLoweredName(
+    hiprtcProgram prog, const char* name_expression, const char** lowered_name) {
+  return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));
+}
+
+/** Forwards to nvrtcGetProgramLog and translates the returned status. */
+inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
+  return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));
+}
+
+/** Forwards to nvrtcGetProgramLogSize and translates the returned status. */
+inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+  return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));
+}
+
+/** Forwards to nvrtcGetPTX: the "code" retrieved through this wrapper is PTX. */
+inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
+  return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));
+}
+
+/** Forwards to nvrtcGetPTXSize: the size reported through this wrapper is the PTX size. */
+inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
+  return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));
+}
+
+#if !defined(_WIN32)
+#pragma GCC visibility pop
+#endif
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+#endif  // HIPRTC_H

From ef4d47916976254abfcb34add6d5aa4af6f6fdd0 Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <Sarbojit.Sarkar@amd.com>
Date: Wed, 16 Jun 2021 10:16:18 +0000
Subject: [PATCH 013/177] SWDEV-284435 - StreamOps mapping with cuda drv API

Change-Id: I6ed9196cfe59d9004338206f6dd8d6fb367b3ffa
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 48 +++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 66e4743abd..5d1813210e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -81,6 +81,10 @@ typedef enum hipMemoryAdvise {
 #define HIP_C_32F CUDA_C_32F
 #define HIP_C_64F CUDA_C_64F
 
+// hip stream operation masks
+#define STREAM_OPS_WAIT_MASK_32 0xFFFFFFFF
+#define STREAM_OPS_WAIT_MASK_64 0xFFFFFFFFFFFFFFFF
+
 // hipLibraryPropertyType
 #define hipLibraryPropertyType libraryPropertyType
 #define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
@@ -2190,6 +2194,50 @@ inline static hipError_t hipArray3DCreate(hiparray* pHandle,
    return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
 }
 
+inline static hipError_t hipStreamWriteValue32(hipStream_t stream,
+                                               void* ptr, int32_t value, unsigned int flags) {
+   if (value < 0) {
+     printf("Warning! value is negative, CUDA accept positive values\n");
+   }
+   return hipCUResultTohipError(cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                             reinterpret_cast<cuuint32_t>(value),flags));
+}
+
+inline static hipError_t hipStreamWriteValue64(hipStream_t stream,
+                                               void* ptr, int64_t value, unsigned int flags) {
+   if (value < 0) {
+     printf("Warning! value is negative, CUDA accept positive values\n");
+   }
+   return hipCUResultTohipError(cuStreamWriteValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                    reinterpret_cast<cuuint64_t>(value),flags));
+}
+
+inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
+                                              void* ptr, int32_t value, unsigned int flags,
+                                              uint32_t mask)) {
+   if (value < 0) {
+     printf("Warning! value is negative, CUDA accept positive values\n");
+   }
+   if (mask != STREAM_OPS_WAIT_MASK_32) {
+     printf("Warning! mask will not have impact as CUDA ignores it.\n");
+   }
+   return hipCUResultTohipError(cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                    reinterpret_cast<cuuint32_t>(value),flags));
+}
+
+inline static hipError_t hipStreamWaitValue64(hipStream_t stream,
+                                              void* ptr, int64_t value, unsigned int flags,
+                                              uint64_t mask)) {
+   if (value < 0) {
+     printf("Warning! value is negative, CUDA accept positive values\n");
+   }
+   if (mask != STREAM_OPS_WAIT_MASK_64) {
+     printf("Warning! mask will not have impact as CUDA ignores it.\n");
+   }
+   return hipCUResultTohipError(cuStreamWaitValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                    reinterpret_cast<cuuint64_t>(value),flags));
+}
+
 #endif  //__CUDACC__
 
 #endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H

From c0a0999d1aacca845a074e913fe6bc40edc83ab4 Mon Sep 17 00:00:00 2001
From: Jatin <Jatin.Chaudhary@amd.com>
Date: Sat, 26 Jun 2021 13:31:10 +0530
Subject: [PATCH 014/177] SWDEV-245414 - fix nvcc CI breakage

Change-Id: I34ff533f34cfc2f60abefc0d8f6c6d8266d30f61
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h      | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 5d1813210e..650236e64a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -27,6 +27,7 @@ THE SOFTWARE.
 #include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <cuda_fp16.h>
+#include <stdio.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -2200,7 +2201,7 @@ inline static hipError_t hipStreamWriteValue32(hipStream_t stream,
      printf("Warning! value is negative, CUDA accept positive values\n");
    }
    return hipCUResultTohipError(cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                             reinterpret_cast<cuuint32_t>(value),flags));
+                                                             static_cast<cuuint32_t>(value), flags));
 }
 
 inline static hipError_t hipStreamWriteValue64(hipStream_t stream,
@@ -2209,12 +2210,12 @@ inline static hipError_t hipStreamWriteValue64(hipStream_t stream,
      printf("Warning! value is negative, CUDA accept positive values\n");
    }
    return hipCUResultTohipError(cuStreamWriteValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    reinterpret_cast<cuuint64_t>(value),flags));
+                                                    static_cast<cuuint64_t>(value), flags));
 }
 
 inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
                                               void* ptr, int32_t value, unsigned int flags,
-                                              uint32_t mask)) {
+                                              uint32_t mask) {
    if (value < 0) {
      printf("Warning! value is negative, CUDA accept positive values\n");
    }
@@ -2222,12 +2223,12 @@ inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
      printf("Warning! mask will not have impact as CUDA ignores it.\n");
    }
    return hipCUResultTohipError(cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    reinterpret_cast<cuuint32_t>(value),flags));
+                                                    static_cast<cuuint32_t>(value), flags));
 }
 
 inline static hipError_t hipStreamWaitValue64(hipStream_t stream,
                                               void* ptr, int64_t value, unsigned int flags,
-                                              uint64_t mask)) {
+                                              uint64_t mask) {
    if (value < 0) {
      printf("Warning! value is negative, CUDA accept positive values\n");
    }
@@ -2235,7 +2236,7 @@ inline static hipError_t hipStreamWaitValue64(hipStream_t stream,
      printf("Warning! mask will not have impact as CUDA ignores it.\n");
    }
    return hipCUResultTohipError(cuStreamWaitValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    reinterpret_cast<cuuint64_t>(value),flags));
+                                                    static_cast<cuuint64_t>(value), flags));
 }
 
 #endif  //__CUDACC__

From d931c0eff900ed39c67fa5e48cc60ff2e6e058f5 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Wed, 23 Jun 2021 06:54:20 -0700
Subject: [PATCH 015/177] SWDEV-240806 - hipGraph support for nvidia path

Change-Id: Idb51b3ed7ca65474afac0dc714c9097294d46bd2
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 199 +++++++++++++-----
 1 file changed, 148 insertions(+), 51 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 650236e64a..dc5900bf48 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -56,13 +56,12 @@ extern "C" {
  * Memory copy types
  *
  */
-typedef enum hipMemcpyKind {
-    hipMemcpyHostToHost,
-    hipMemcpyHostToDevice,
-    hipMemcpyDeviceToHost,
-    hipMemcpyDeviceToDevice,
-    hipMemcpyDefault
-} hipMemcpyKind;
+typedef enum cudaMemcpyKind hipMemcpyKind;
+#define hipMemcpyHostToHost cudaMemcpyHostToHost
+#define hipMemcpyHostToDevice cudaMemcpyHostToDevice
+#define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
+#define hipMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define hipMemcpyDefault cudaMemcpyDefault
 
 typedef enum hipMemoryAdvise {
     hipMemAdviseSetReadMostly,
@@ -906,6 +905,54 @@ inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormat
     }
 }
 
+/**
+ * graph types
+ *
+ */
+typedef cudaGraph_t hipGraph_t;
+typedef cudaGraphNode_t hipGraphNode_t;
+typedef cudaGraphExec_t hipGraphExec_t;
+
+typedef enum cudaGraphNodeType hipGraphNodeType;
+#define hipGraphNodeTypeKernel cudaGraphNodeTypeKernel
+#define hipGraphNodeTypeMemcpy cudaGraphNodeTypeMemcpy
+#define hipGraphNodeTypeMemset cudaGraphNodeTypeMemset
+#define hipGraphNodeTypeHost cudaGraphNodeTypeHost
+#define hipGraphNodeTypeGraph cudaGraphNodeTypeGraph
+#define hipGraphNodeTypeEmpty cudaGraphNodeTypeEmpty
+#define hipGraphNodeTypeWaitEvent cudaGraphNodeTypeWaitEvent
+#define hipGraphNodeTypeEventRecord cudaGraphNodeTypeEventRecord
+#define hipGraphNodeTypeMemcpy1D cudaGraphNodeTypeMemcpy1D
+#define hipGraphNodeTypeMemcpyFromSymbol cudaGraphNodeTypeMemcpyFromSymbol
+#define hipGraphNodeTypeMemcpyToSymbol cudaGraphNodeTypeMemcpyToSymbol
+#define hipGraphNodeTypeCount cudaGraphNodeTypeCount
+
+typedef cudaHostFn_t hipHostFn_t;
+typedef struct cudaHostNodeParams hipHostNodeParams;
+typedef struct cudaKernelNodeParams hipKernelNodeParams;
+typedef struct cudaMemsetParams hipMemsetParams;
+
+typedef enum cudaGraphExecUpdateResult hipGraphExecUpdateResult;
+#define hipGraphExecUpdateSuccess cudaGraphExecUpdateSuccess
+#define hipGraphExecUpdateError cudaGraphExecUpdateError
+#define hipGraphExecUpdateErrorTopologyChanged cudaGraphExecUpdateErrorTopologyChanged
+#define hipGraphExecUpdateErrorNodeTypeChanged cudaGraphExecUpdateErrorNodeTypeChanged
+#define hipGraphExecUpdateErrorFunctionChanged cudaGraphExecUpdateErrorFunctionChanged
+#define hipGraphExecUpdateErrorParametersChanged cudaGraphExecUpdateErrorParametersChanged
+#define hipGraphExecUpdateErrorNotSupported cudaGraphExecUpdateErrorNotSupported
+#define hipGraphExecUpdateErrorUnsupportedFunctionChange                                           \
+  cudaGraphExecUpdateErrorUnsupportedFunctionChange
+
+typedef enum cudaStreamCaptureMode hipStreamCaptureMode;
+#define hipStreamCaptureModeGlobal cudaStreamCaptureModeGlobal
+#define hipStreamCaptureModeThreadLocal cudaStreamCaptureModeThreadLocal
+#define hipStreamCaptureModeRelaxed cudaStreamCaptureModeRelaxed
+
+typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
+#define hipStreamCaptureStatusNone cudaStreamCaptureStatusNone
+#define hipStreamCaptureStatusActive cudaStreamCaptureStatusActive
+#define hipStreamCaptureStatusInvalidated cudaStreamCaptureStatusInvalidated
+
 /**
  * Stream CallBack struct
  */
@@ -1096,33 +1143,29 @@ inline static hipError_t hipMemcpyDtoDAsync(hipDeviceptr_t dst, hipDeviceptr_t s
 inline static hipError_t hipMemcpy(void* dst, const void* src, size_t sizeBytes,
                                    hipMemcpyKind copyKind) {
     return hipCUDAErrorTohipError(
-        cudaMemcpy(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind)));
+        cudaMemcpy(dst, src, sizeBytes, copyKind));
 }
 
 
-inline static hipError_t hipMemcpyWithStream(void* dst, const void* src,
-				      size_t sizeBytes, hipMemcpyKind copyKind,
-				      hipStream_t stream) {
-	cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, 
-										hipMemcpyKindToCudaMemcpyKind(copyKind),
-										stream);
-	
-	if (error != cudaSuccess) return hipCUDAErrorTohipError(error);
-	
-	return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
+inline static hipError_t hipMemcpyWithStream(void* dst, const void* src, size_t sizeBytes,
+                                             hipMemcpyKind copyKind, hipStream_t stream) {
+    cudaError_t error = cudaMemcpyAsync(dst, src, sizeBytes, copyKind, stream);
+
+    if (error != cudaSuccess) return hipCUDAErrorTohipError(error);
+
+    return hipCUDAErrorTohipError(cudaStreamSynchronize(stream));
 }
 
 inline static hipError_t hipMemcpyAsync(void* dst, const void* src, size_t sizeBytes,
                                         hipMemcpyKind copyKind, hipStream_t stream __dparm(0)) {
     return hipCUDAErrorTohipError(
-        cudaMemcpyAsync(dst, src, sizeBytes, hipMemcpyKindToCudaMemcpyKind(copyKind), stream));
+        cudaMemcpyAsync(dst, src, sizeBytes, copyKind, stream));
 }
 
-inline static hipError_t hipMemcpyToSymbol(const void* symbol, const void* src, size_t sizeBytes,
-                                           size_t offset __dparm(0),
-                                           hipMemcpyKind copyType __dparm(hipMemcpyHostToDevice)) {
-    return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset,
-                                                     hipMemcpyKindToCudaMemcpyKind(copyType)));
+inline static hipError_t hipMemcpyToSymbol(
+    const void* symbol, const void* src, size_t sizeBytes, size_t offset __dparm(0),
+    hipMemcpyKind copyType __dparm(hipMemcpyKindToCudaMemcpyKind(hipMemcpyHostToDevice))) {
+    return hipCUDAErrorTohipError(cudaMemcpyToSymbol(symbol, src, sizeBytes, offset, copyType));
 }
 
 inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void* src,
@@ -1130,14 +1173,13 @@ inline static hipError_t hipMemcpyToSymbolAsync(const void* symbol, const void*
                                                 hipMemcpyKind copyType,
                                                 hipStream_t stream __dparm(0)) {
     return hipCUDAErrorTohipError(cudaMemcpyToSymbolAsync(
-        symbol, src, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(copyType), stream));
+        symbol, src, sizeBytes, offset, copyType, stream));
 }
 
-inline static hipError_t hipMemcpyFromSymbol(void* dst, const void* symbolName, size_t sizeBytes,
-                                             size_t offset __dparm(0),
-                                             hipMemcpyKind kind __dparm(hipMemcpyDeviceToHost)) {
-    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset,
-                                                       hipMemcpyKindToCudaMemcpyKind(kind)));
+inline static hipError_t hipMemcpyFromSymbol(
+    void* dst, const void* symbolName, size_t sizeBytes, size_t offset __dparm(0),
+    hipMemcpyKind kind __dparm(hipMemcpyKindToCudaMemcpyKind(hipMemcpyDeviceToHost))) {
+    return hipCUDAErrorTohipError(cudaMemcpyFromSymbol(dst, symbolName, sizeBytes, offset, kind));
 }
 
 inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolName,
@@ -1145,7 +1187,7 @@ inline static hipError_t hipMemcpyFromSymbolAsync(void* dst, const void* symbolN
                                                   hipMemcpyKind kind,
                                                   hipStream_t stream __dparm(0)) {
     return hipCUDAErrorTohipError(cudaMemcpyFromSymbolAsync(
-        dst, symbolName, sizeBytes, offset, hipMemcpyKindToCudaMemcpyKind(kind), stream));
+        dst, symbolName, sizeBytes, offset, kind, stream));
 }
 
 inline static hipError_t hipGetSymbolAddress(void** devPtr, const void* symbolName) {
@@ -1159,7 +1201,7 @@ inline static hipError_t hipGetSymbolSize(size_t* size, const void* symbolName)
 inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src, size_t spitch,
                                      size_t width, size_t height, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(
-        cudaMemcpy2D(dst, dpitch, src, spitch, width, height, hipMemcpyKindToCudaMemcpyKind(kind)));
+        cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind));
 }
 
 inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
@@ -1190,7 +1232,7 @@ inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void*
                                           size_t width, size_t height, hipMemcpyKind kind,
                                           hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height,
-                                                    hipMemcpyKindToCudaMemcpyKind(kind), stream));
+                                                    kind, stream));
 }
 
 inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray* src,
@@ -1198,7 +1240,7 @@ inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray
                                               size_t height, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width,
                                                         height,
-                                                        hipMemcpyKindToCudaMemcpyKind(kind)));
+                                                        kind));
 }
 
 inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray* src,
@@ -1207,7 +1249,7 @@ inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hip
                                                    hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaMemcpy2DFromArrayAsync(dst, dpitch, src, wOffset, hOffset,
                                                              width, height,
-                                                             hipMemcpyKindToCudaMemcpyKind(kind),
+                                                             kind,
                                                              stream));
 }
 
@@ -1215,7 +1257,7 @@ inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_
                                             const void* src, size_t spitch, size_t width,
                                             size_t height, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width,
-                                                      height, hipMemcpyKindToCudaMemcpyKind(kind)));
+                                                      height, kind));
 }
 
 inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset,
@@ -1224,7 +1266,7 @@ inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset,
                                                  hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaMemcpy2DToArrayAsync(dst, wOffset, hOffset, src, spitch,
                                                            width, height,
-                                                           hipMemcpyKindToCudaMemcpyKind(kind),
+                                                           kind,
                                                            stream));
 }
 
@@ -1232,14 +1274,14 @@ __HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t
                                                            size_t hOffset, const void* src,
                                                            size_t count, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(
-        cudaMemcpyToArray(dst, wOffset, hOffset, src, count, hipMemcpyKindToCudaMemcpyKind(kind)));
+        cudaMemcpyToArray(dst, wOffset, hOffset, src, count, kind));
 }
 
 __HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray_const_t srcArray,
                                                              size_t wOffset, size_t hOffset,
                                                              size_t count, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(cudaMemcpyFromArray(dst, srcArray, wOffset, hOffset, count,
-                                                      hipMemcpyKindToCudaMemcpyKind(kind)));
+                                                      kind));
 }
 
 inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
@@ -1973,10 +2015,10 @@ inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* im
 }
 
 inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numBlocks,
-					 dim3 dimBlocks, void** args, size_t sharedMemBytes,
-					 hipStream_t stream)
-{
-   return hipCUDAErrorTohipError(cudaLaunchKernel(function_address,numBlocks,dimBlocks,args,sharedMemBytes,stream));
+                                         dim3 dimBlocks, void** args, size_t sharedMemBytes,
+                                         hipStream_t stream) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchKernel(function_address, numBlocks, dimBlocks, args, sharedMemBytes, stream));
 }
 
 inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
@@ -2163,36 +2205,91 @@ inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mod
 }
 
 inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){
-   return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
+    return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
 }
 
 inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARRAY_DESCRIPTOR *desc, hipDeviceptr_t dptr, size_t Pitch){
-   return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
+    return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
 }
 
 inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
-   return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
+    return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
 }
 
 inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
-   return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
+    return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
 }
 
 inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
-   return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
+    return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
 }
 
 inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
-   return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+    return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
 }
 
 inline static hipError_t hipArrayDestroy(hiparray hArray){
-   return hipCUResultTohipError(cuArrayDestroy(hArray));
+    return hipCUResultTohipError(cuArrayDestroy(hArray));
 }
 
 inline static hipError_t hipArray3DCreate(hiparray* pHandle,
                                           const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray){
-   return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+    return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+}
+
+inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
+    return hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode));
+}
+
+inline static hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph) {
+    return hipCUDAErrorTohipError(cudaStreamEndCapture(stream, pGraph));
+}
+
+inline static hipError_t hipGraphCreate(hipGraph_t* pGraph, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaGraphCreate(pGraph, flags));
+}
+
+inline static hipError_t hipGraphDestroy(hipGraph_t graph) {
+    return hipCUDAErrorTohipError(cudaGraphDestroy(graph));
+}
+
+inline static hipError_t hipGraphExecDestroy(hipGraphExec_t pGraphExec) {
+    return hipCUDAErrorTohipError(cudaGraphExecDestroy(pGraphExec));
+}
+
+inline static hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                             hipGraphNode_t* pErrorNode, char* pLogBuffer,
+                                             size_t bufferSize) {
+    return hipCUDAErrorTohipError(
+        cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize));
+}
+
+inline static hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaGraphLaunch(graphExec, stream));
+}
+
+inline static hipError_t hipGraphAddKernelNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                               const hipGraphNode_t* pDependencies,
+                                               size_t numDependencies,
+                                               const hipKernelNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddKernelNode(pGraphNode, graph, pDependencies, numDependencies, pNodeParams));
+}
+
+inline static hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                               const hipGraphNode_t* pDependencies,
+                                               size_t numDependencies,
+                                               const hipMemcpy3DParms* pCopyParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddMemcpyNode(pGraphNode, graph, pDependencies, numDependencies, pCopyParams));
+}
+
+inline static hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                               const hipGraphNode_t* pDependencies,
+                                               size_t numDependencies,
+                                               const hipMemsetParams* pMemsetParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddMemsetNode(pGraphNode, graph, pDependencies, numDependencies, pMemsetParams));
 }
 
 inline static hipError_t hipStreamWriteValue32(hipStream_t stream,

From 171f5d5869cbd506ac7c4280c2bbfb1f5cc45f30 Mon Sep 17 00:00:00 2001
From: agunashe <ajay.gunashekar@amd.com>
Date: Fri, 2 Jul 2021 16:46:49 -0700
Subject: [PATCH 016/177] SWDEV-293742 - Update copyrights end year for hipamd

Change-Id: I08f620f84563a9214b59f1b943ed091b67229eab
---
 hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h | 2 +-
 hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h        | 2 +-
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h        | 2 +-
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h    | 2 +-
 hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h             | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h b/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
index 7eb0e65fda..b5873be174 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_channel_descriptor.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
index 10a53d1743..2e14e893a3 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
index a42fecc611..007fc70085 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index dc5900bf48..adddb2f484 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
index 449ba26c0f..449fe342af 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - present Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From 8b809eb24eb120fba2970166c1e57e5233381871 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Wed, 7 Jul 2021 03:58:37 -0700
Subject: [PATCH 017/177] SWDEV-240807 - Added implementation for a few more
 graph APIs

Change-Id: I76336a22233a208a3f54ff9e90f0c5bf4a1bddb4
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 58 +++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index adddb2f484..1e1fbd13a4 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2292,6 +2292,64 @@ inline static hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGr
         cudaGraphAddMemsetNode(pGraphNode, graph, pDependencies, numDependencies, pMemsetParams));
 }
 
+inline static hipError_t hipGraphGetNodes(hipGraph_t graph, hipGraphNode_t* nodes,
+                                          size_t* numNodes) {
+    return hipCUDAErrorTohipError(cudaGraphGetNodes(graph, nodes, numNodes));
+}
+
+inline static hipError_t hipGraphGetRootNodes(hipGraph_t graph, hipGraphNode_t* pRootNodes,
+                                              size_t* pNumRootNodes) {
+    return hipCUDAErrorTohipError(cudaGraphGetRootNodes(graph, pRootNodes, pNumRootNodes));
+}
+
+inline static hipError_t hipGraphKernelNodeGetParams(hipGraphNode_t node,
+                                                     hipKernelNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphKernelNodeGetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node,
+                                                     const hipKernelNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphKernelNodeSetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node,
+                                                     hipMemcpy3DParms* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphMemcpyNodeGetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphMemcpyNodeSetParams(hipGraphNode_t node,
+                                                     const hipMemcpy3DParms* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphMemcpyNodeSetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphMemsetNodeGetParams(hipGraphNode_t node,
+                                                     hipMemsetParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphMemsetNodeGetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node,
+                                                     const hipMemsetParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphMemsetNodeSetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec,
+                                                         hipGraphNode_t node,
+                                                         const hipKernelNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExecKernelNodeSetParams(hGraphExec, node, pNodeParams));
+}
+
+inline static hipError_t hipGraphAddDependencies(hipGraph_t graph, const hipGraphNode_t* from,
+                                                 const hipGraphNode_t* to, size_t numDependencies) {
+    return hipCUDAErrorTohipError(cudaGraphAddDependencies(graph, from, to, numDependencies));
+}
+
+inline static hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                              const hipGraphNode_t* pDependencies,
+                                              size_t numDependencies) {
+    return hipCUDAErrorTohipError(
+      cudaGraphAddEmptyNode(pGraphNode, graph, pDependencies, numDependencies));
+}
+
 inline static hipError_t hipStreamWriteValue32(hipStream_t stream,
                                                void* ptr, int32_t value, unsigned int flags) {
    if (value < 0) {

From b0bd8d6859aea712ced6795abdfd24f31c2b2bb1 Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Mon, 12 Jul 2021 22:55:40 -0400
Subject: [PATCH 018/177] SWDEV-294588 - Enable NV printf DTests

Add hipLimitPrintfFifoSize.
Add hipDeviceSetLimit(hipLimit_t limit, size_t value) for NV.

Change-Id: Ife884e0c3081b317bdadc8bec7814d1d7c60153a
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 1e1fbd13a4..76da17e1a4 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -281,6 +281,7 @@ typedef enum cudaMemRangeAttribute hipMemRangeAttribute;
 #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
 #define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
 #define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
+#define hipLimitPrintfFifoSize cudaLimitPrintfFifoSize
 #define hipLimitMallocHeapSize cudaLimitMallocHeapSize
 #define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
 
@@ -1969,6 +1970,10 @@ inline static hipError_t hipDeviceGetLimit(size_t* pValue, hipLimit_t limit) {
     return hipCUDAErrorTohipError(cudaDeviceGetLimit(pValue, limit));
 }
 
+inline static hipError_t hipDeviceSetLimit(hipLimit_t limit, size_t value) {
+    return hipCUDAErrorTohipError(cudaDeviceSetLimit(limit, value));
+}
+
 inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {
     return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
 }

From 3efa0aa254f34f3ad3854af87a0daf49bcc56db9 Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Sun, 25 Jul 2021 20:59:49 -0400
Subject: [PATCH 019/177] SWDEV-294594 - Fix hipPointerGetAttributes test on NV

Replace return value of hipPointerGetAttributes() on NV from
hipErrorUnknown to hipErrorInvalidValue if memory has been freed.

Change-Id: I3fe6dbc35a7a14aa9109df297b7885df83d28149
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 76da17e1a4..a53d68e72e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1699,7 +1699,7 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
                 attributes->memoryType = hipMemoryTypeHost;
                 break;
             default:
-                return hipErrorUnknown;
+                return hipErrorInvalidValue;
         }
         attributes->device = cPA.device;
         attributes->devicePointer = cPA.devicePointer;

From ce3a40699c0d9b4c780317ac1a968ac468de3542 Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <Sarbojit.Sarkar@amd.com>
Date: Thu, 22 Jul 2021 13:08:52 +0000
Subject: [PATCH 020/177] SWDEV-292547 - hipStreamPerThread support

Change-Id: Id621ce073b0fee9eac03c59ffb78b197fda4ddb5
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a53d68e72e..fbe70bab8a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -198,6 +198,9 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
     }
 }
 
+// hipStreamPerThread
+#define hipStreamPerThread ((cudaStream_t)2)
+
 #define hipTexRef CUtexref
 #define hiparray CUarray
 

From a2716fa1af3acf3149f417bf868a7050ef736f2b Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Tue, 20 Jul 2021 04:48:06 -0700
Subject: [PATCH 021/177] SWDEV-240806 - Added API hipGraphAddMemcpyNode1D,
 hipGraphAddEmptyNode, hipGraphExecKernelNodeSetParams

Change-Id: I0d7ec8c0ea1abc3fc0f1e10fa7865f355d9cf2ad
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index fbe70bab8a..5175cebcb5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2292,6 +2292,13 @@ inline static hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGr
         cudaGraphAddMemcpyNode(pGraphNode, graph, pDependencies, numDependencies, pCopyParams));
 }
 
+inline static hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                   const hipGraphNode_t* pDependencies, size_t numDependencies,
+                                   void* dst, const void* src, size_t count, hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddMemcpyNode1D(pGraphNode, graph, pDependencies, numDependencies, dst, src, count, kind));
+}
+
 inline static hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                                const hipGraphNode_t* pDependencies,
                                                size_t numDependencies,

From 6e7072b980c4ccdb6b068b20ebac24ab332f3fff Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 2 Sep 2021 09:03:04 +0000
Subject: [PATCH 022/177] SWDEV-301330 - Remove the hard coded driver version
 on HIP CUDA platform

Change-Id: I0eb10781acc7524174c7ae6fc552c824b7c94b0a
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 5175cebcb5..97032b1f72 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1785,12 +1785,7 @@ inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallb
 }
 
 inline static hipError_t hipDriverGetVersion(int* driverVersion) {
-    cudaError_t err = cudaDriverGetVersion(driverVersion);
-
-    // Override driver version to match version reported on HCC side.
-    *driverVersion = 4;
-
-    return hipCUDAErrorTohipError(err);
+    return hipCUDAErrorTohipError(cudaDriverGetVersion(driverVersion));
 }
 
 inline static hipError_t hipRuntimeGetVersion(int* runtimeVersion) {

From e24e026f37b46882a353784e19a12517fbcbd5ce Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Mon, 20 Sep 2021 07:28:25 +0000
Subject: [PATCH 023/177] SWDEV-298667 - fix stream operation for Cuda

Change-Id: Ida91712e678e324b54293ba48dfca442be390783
---
 .../include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 97032b1f72..df8f1a5fa8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -85,6 +85,12 @@ typedef enum hipMemoryAdvise {
 #define STREAM_OPS_WAIT_MASK_32 0xFFFFFFFF
 #define STREAM_OPS_WAIT_MASK_64 0xFFFFFFFFFFFFFFFF
 
+// stream operation flags
+#define hipStreamWaitValueGte CU_STREAM_WAIT_VALUE_GEQ
+#define hipStreamWaitValueEq  CU_STREAM_WAIT_VALUE_EQ
+#define hipStreamWaitValueAnd CU_STREAM_WAIT_VALUE_AND
+#define hipStreamWaitValueNor CU_STREAM_WAIT_VALUE_NOR
+
 // hipLibraryPropertyType
 #define hipLibraryPropertyType libraryPropertyType
 #define HIP_LIBRARY_MAJOR_VERSION MAJOR_VERSION
@@ -2380,7 +2386,7 @@ inline static hipError_t hipStreamWriteValue64(hipStream_t stream,
 
 inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
                                               void* ptr, int32_t value, unsigned int flags,
-                                              uint32_t mask) {
+                                              uint32_t mask __dparm(0xFFFFFFFF)) {
    if (value < 0) {
      printf("Warning! value is negative, CUDA accept positive values\n");
    }
@@ -2393,7 +2399,7 @@ inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
 
 inline static hipError_t hipStreamWaitValue64(hipStream_t stream,
                                               void* ptr, int64_t value, unsigned int flags,
-                                              uint64_t mask) {
+                                              uint64_t mask __dparm(0xFFFFFFFFFFFFFFFF)) {
    if (value < 0) {
      printf("Warning! value is negative, CUDA accept positive values\n");
    }

From 7134d7bb00dd5a1a8290d5e8367fc2b59dd9e9ac Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Thu, 23 Sep 2021 09:53:00 +0000
Subject: [PATCH 024/177] SWDEV-304076 - Fix for Cuda build error

Change-Id: Ibca63a08b8d6d1235b51a06c0bc024cd284cee97
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index df8f1a5fa8..f5efa51032 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -50,6 +50,17 @@ extern "C" {
 #define __HIP_DEPRECATED
 #endif
 
+// Add Deprecated Support for CUDA Mapped HIP APIs
+#if defined(__DOXYGEN_ONLY__) || defined(HIP_ENABLE_DEPRECATED)
+#define __HIP_DEPRECATED_MSG(msg)
+#elif defined(_MSC_VER)
+#define __HIP_DEPRECATED_MSG(msg) __declspec(deprecated(msg))
+#elif defined(__GNUC__)
+#define __HIP_DEPRECATED_MSG(msg) __attribute__((deprecated(msg)))
+#else
+#define __HIP_DEPRECATED_MSG(msg)
+#endif
+
 
 // TODO -move to include/hip_runtime_api.h as a common implementation.
 /**
@@ -998,20 +1009,17 @@ inline static hipError_t hipMalloc3D(hipPitchedPtr* pitchedDevPtr, hipExtent ext
 
 inline static hipError_t hipFree(void* ptr) { return hipCUDAErrorTohipError(cudaFree(ptr)); }
 
-inline static hipError_t hipMallocHost(void** ptr, size_t size)
-    __attribute__((deprecated("use hipHostMalloc instead")));
+__HIP_DEPRECATED_MSG("use hipHostMalloc instead")
 inline static hipError_t hipMallocHost(void** ptr, size_t size) {
     return hipCUDAErrorTohipError(cudaMallocHost(ptr, size));
 }
 
-inline static hipError_t hipMemAllocHost(void** ptr, size_t size)
-    __attribute__((deprecated("use hipHostMalloc instead")));
+__HIP_DEPRECATED_MSG("use hipHostMalloc instead")
 inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {
     return hipCUResultTohipError(cuMemAllocHost(ptr, size));
 }
 
-inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags)
-    __attribute__((deprecated("use hipHostMalloc instead")));
+__HIP_DEPRECATED_MSG("use hipHostMalloc instead")
 inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
 }
@@ -1087,8 +1095,7 @@ inline static hipError_t hipHostUnregister(void* ptr) {
     return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
 }
 
-inline static hipError_t hipFreeHost(void* ptr)
-    __attribute__((deprecated("use hipHostFree instead")));
+__HIP_DEPRECATED_MSG("use hipHostFree instead")
 inline static hipError_t hipFreeHost(void* ptr) {
     return hipCUDAErrorTohipError(cudaFreeHost(ptr));
 }

From c457f5cd121eb982f614935843e1b9bafd31ed3c Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <jatin.chaudhary@amd.com>
Date: Wed, 29 Sep 2021 23:32:49 -0700
Subject: [PATCH 025/177] SWDEV-304552 - Add default to return the last error
 value

Change-Id: Ia399033e30aaa454a454067f5afc6f7d1e5ff8dd
---
 hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
index 449fe342af..db5657e6e6 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
@@ -76,6 +76,8 @@ inline static nvrtcResult hiprtcResultTonvrtcResult(hiprtcResult result) {
       return NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
     case HIPRTC_ERROR_INTERNAL_ERROR:
       return NVRTC_ERROR_INTERNAL_ERROR;
+    default:
+      return NVRTC_ERROR_INTERNAL_ERROR;
   }
 }
 
@@ -105,6 +107,8 @@ inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
       return HIPRTC_ERROR_NAME_EXPRESSION_NOT_VALID;
     case NVRTC_ERROR_INTERNAL_ERROR:
       return HIPRTC_ERROR_INTERNAL_ERROR;
+    default:
+      return HIPRTC_ERROR_INTERNAL_ERROR;
   }
 }
 

From 6f942ccdd446f50a58bd1222660bca53b7fe3e0e Mon Sep 17 00:00:00 2001
From: anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Tue, 5 Oct 2021 07:34:04 -0700
Subject: [PATCH 026/177] SWDEV-240806 - Added a few Graph API implementations

hipGraph APIs: clone, childGraph, RemoveDependencies, GetEdges, GetDependencies, GetDependentNodes, GetType and DestroyNode.
hipStream APIs: GetCaptureInfo, GetCaptureInfo_v2, UpdateCaptureDependencies.

Change-Id: Ib0f4cb8ea335698eecdd6d744ffab1e954153673
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 161 ++++++++++++++----
 1 file changed, 127 insertions(+), 34 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f5efa51032..a5b3c556ae 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -974,6 +974,10 @@ typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
 #define hipStreamCaptureStatusActive cudaStreamCaptureStatusActive
 #define hipStreamCaptureStatusInvalidated cudaStreamCaptureStatusInvalidated
 
+typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDependenciesFlags;
+#define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies
+#define hipStreamSetCaptureDependencies cudaStreamSetCaptureDependencies
+
 /**
  * Stream CallBack struct
  */
@@ -2373,48 +2377,137 @@ inline static hipError_t hipGraphAddEmptyNode(hipGraphNode_t* pGraphNode, hipGra
       cudaGraphAddEmptyNode(pGraphNode, graph, pDependencies, numDependencies));
 }
 
-inline static hipError_t hipStreamWriteValue32(hipStream_t stream,
-                                               void* ptr, int32_t value, unsigned int flags) {
-   if (value < 0) {
-     printf("Warning! value is negative, CUDA accept positive values\n");
-   }
-   return hipCUResultTohipError(cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                             static_cast<cuuint32_t>(value), flags));
+inline static hipError_t hipStreamWriteValue32(hipStream_t stream, void* ptr, int32_t value,
+                                               unsigned int flags) {
+    if (value < 0) {
+        printf("Warning! value is negative, CUDA accept positive values\n");
+    }
+    return hipCUResultTohipError(cuStreamWriteValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                      static_cast<cuuint32_t>(value), flags));
 }
 
-inline static hipError_t hipStreamWriteValue64(hipStream_t stream,
-                                               void* ptr, int64_t value, unsigned int flags) {
-   if (value < 0) {
-     printf("Warning! value is negative, CUDA accept positive values\n");
-   }
-   return hipCUResultTohipError(cuStreamWriteValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    static_cast<cuuint64_t>(value), flags));
+inline static hipError_t hipStreamWriteValue64(hipStream_t stream, void* ptr, int64_t value,
+                                               unsigned int flags) {
+    if (value < 0) {
+        printf("Warning! value is negative, CUDA accept positive values\n");
+    }
+    return hipCUResultTohipError(cuStreamWriteValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                      static_cast<cuuint64_t>(value), flags));
 }
 
-inline static hipError_t hipStreamWaitValue32(hipStream_t stream,
-                                              void* ptr, int32_t value, unsigned int flags,
+inline static hipError_t hipStreamWaitValue32(hipStream_t stream, void* ptr, int32_t value,
+                                              unsigned int flags,
                                               uint32_t mask __dparm(0xFFFFFFFF)) {
-   if (value < 0) {
-     printf("Warning! value is negative, CUDA accept positive values\n");
-   }
-   if (mask != STREAM_OPS_WAIT_MASK_32) {
-     printf("Warning! mask will not have impact as CUDA ignores it.\n");
-   }
-   return hipCUResultTohipError(cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    static_cast<cuuint32_t>(value), flags));
+    if (value < 0) {
+        printf("Warning! value is negative, CUDA accept positive values\n");
+    }
+    if (mask != STREAM_OPS_WAIT_MASK_32) {
+        printf("Warning! mask will not have impact as CUDA ignores it.\n");
+    }
+    return hipCUResultTohipError(cuStreamWaitValue32(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                     static_cast<cuuint32_t>(value), flags));
 }
 
-inline static hipError_t hipStreamWaitValue64(hipStream_t stream,
-                                              void* ptr, int64_t value, unsigned int flags,
+inline static hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int64_t value,
+                                              unsigned int flags,
                                               uint64_t mask __dparm(0xFFFFFFFFFFFFFFFF)) {
-   if (value < 0) {
-     printf("Warning! value is negative, CUDA accept positive values\n");
-   }
-   if (mask != STREAM_OPS_WAIT_MASK_64) {
-     printf("Warning! mask will not have impact as CUDA ignores it.\n");
-   }
-   return hipCUResultTohipError(cuStreamWaitValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
-                                                    static_cast<cuuint64_t>(value), flags));
+    if (value < 0) {
+        printf("Warning! value is negative, CUDA accept positive values\n");
+    }
+    if (mask != STREAM_OPS_WAIT_MASK_64) {
+        printf("Warning! mask will not have impact as CUDA ignores it.\n");
+    }
+    return hipCUResultTohipError(cuStreamWaitValue64(stream, reinterpret_cast<CUdeviceptr>(ptr),
+                                                     static_cast<cuuint64_t>(value), flags));
+}
+
+inline static hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from,
+                                                    const hipGraphNode_t* to,
+                                                    size_t numDependencies) {
+    return hipCUDAErrorTohipError(cudaGraphRemoveDependencies(graph, from, to, numDependencies));
+}
+
+
+inline static hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from,
+                                          hipGraphNode_t* to, size_t* numEdges) {
+    return hipCUDAErrorTohipError(cudaGraphGetEdges(graph, from, to, numEdges));
+}
+
+
+inline static hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node,
+                                                     hipGraphNode_t* pDependencies,
+                                                     size_t* pNumDependencies) {
+    return hipCUDAErrorTohipError(
+        cudaGraphNodeGetDependencies(node, pDependencies, pNumDependencies));
+}
+
+inline static hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node,
+                                                       hipGraphNode_t* pDependentNodes,
+                                                       size_t* pNumDependentNodes) {
+    return hipCUDAErrorTohipError(
+        cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes));
+}
+
+inline static hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType) {
+    return hipCUDAErrorTohipError(cudaGraphNodeGetType(node, pType));
+}
+
+inline static hipError_t hipGraphDestroyNode(hipGraphNode_t node) {
+    return hipCUDAErrorTohipError(cudaGraphDestroyNode(node));
+}
+
+inline static hipError_t hipGraphClone(hipGraph_t* pGraphClone, hipGraph_t originalGraph) {
+    return hipCUDAErrorTohipError(cudaGraphClone(pGraphClone, originalGraph));
+}
+
+inline static hipError_t hipGraphNodeFindInClone(hipGraphNode_t* pNode, hipGraphNode_t originalNode,
+                                                 hipGraph_t clonedGraph) {
+    return hipCUDAErrorTohipError(cudaGraphNodeFindInClone(pNode, originalNode, clonedGraph));
+}
+
+inline static hipError_t hipGraphAddChildGraphNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                                   const hipGraphNode_t* pDependencies,
+                                                   size_t numDependencies, hipGraph_t childGraph) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddChildGraphNode(pGraphNode, graph, pDependencies, numDependencies, childGraph));
+}
+
+inline static hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) {
+    return hipCUDAErrorTohipError(cudaGraphChildGraphNodeGetGraph(node, pGraph));
+}
+
+inline static hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec,
+                                                             hipGraphNode_t node,
+                                                             hipGraph_t childGraph) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph));
+}
+
+
+inline static hipError_t hipStreamGetCaptureInfo(hipStream_t stream,
+                                                 hipStreamCaptureStatus* pCaptureStatus,
+                                                 unsigned long long* pId) {
+    return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo(stream, pCaptureStatus, pId));
+}
+
+inline static hipError_t hipStreamGetCaptureInfo_v2(
+    hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
+    unsigned long long* id_out __dparm(0), hipGraph_t* graph_out __dparm(0),
+    const hipGraphNode_t** dependencies_out __dparm(0), size_t* numDependencies_out __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo_v2(
+        stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out));
+}
+
+inline static hipError_t hipStreamIsCapturing(hipStream_t stream,
+                                              hipStreamCaptureStatus* pCaptureStatus) {
+    return hipCUDAErrorTohipError(cudaStreamIsCapturing(stream, pCaptureStatus));
+}
+
+inline static hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream,
+                                                            hipGraphNode_t* dependencies,
+                                                            size_t numDependencies,
+                                                            unsigned int flags __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaStreamUpdateCaptureDependencies(stream, dependencies, flags));
 }
 
 #endif  //__CUDACC__

From 910fa7e2fa501137e8febecbd1f3dc9946935ebc Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Thu, 21 Oct 2021 17:36:14 -0400
Subject: [PATCH 027/177] SWDEV-306947 Enable hipAddressModeBorder

Enable hipAddressModeBorder.
Fix the default value of the height parameter of hipMallocArray().
Some code improvements.

Change-Id: I57045118e7adf915074c547cbe76349a4cfd72d8
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a5b3c556ae..d4facc9360 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1069,7 +1069,7 @@ inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int
 }
 
 inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
-                                        size_t width, size_t height,
+                                        size_t width, size_t height __dparm(0),
                                         unsigned int flags __dparm(hipArrayDefault)) {
     return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags));
 }

From 198ad718a546795ce882f70ce7feb8e49861565f Mon Sep 17 00:00:00 2001
From: anusha GodavarthySurya <Anusha.GodavarthySurya@amd.com>
Date: Mon, 22 Nov 2021 07:17:34 -0800
Subject: [PATCH 028/177] SWDEV-240808 - Added nvidia handling for a few graph
 APIs

Change-Id: I8edee87ea75d80152418f76cc8a521b2ba58f139
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 151 +++++++++++++++++-
 1 file changed, 149 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d4facc9360..657d354a88 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2433,7 +2433,6 @@ inline static hipError_t hipGraphGetEdges(hipGraph_t graph, hipGraphNode_t* from
     return hipCUDAErrorTohipError(cudaGraphGetEdges(graph, from, to, numEdges));
 }
 
-
 inline static hipError_t hipGraphNodeGetDependencies(hipGraphNode_t node,
                                                      hipGraphNode_t* pDependencies,
                                                      size_t* pNumDependencies) {
@@ -2483,7 +2482,6 @@ inline static hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGra
         cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph));
 }
 
-
 inline static hipError_t hipStreamGetCaptureInfo(hipStream_t stream,
                                                  hipStreamCaptureStatus* pCaptureStatus,
                                                  unsigned long long* pId) {
@@ -2510,6 +2508,155 @@ inline static hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream,
     return hipCUDAErrorTohipError(cudaStreamUpdateCaptureDependencies(stream, dependencies, flags));
 }
 
+inline static hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                                    const hipGraphNode_t* pDependencies,
+                                                    size_t numDependencies, hipEvent_t event) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddEventRecordNode(pGraphNode, graph, pDependencies, numDependencies, event));
+}
+
+inline static hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                                  const hipGraphNode_t* pDependencies,
+                                                  size_t numDependencies, hipEvent_t event) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddEventWaitNode(pGraphNode, graph, pDependencies, numDependencies, event));
+}
+
+inline static hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                             const hipGraphNode_t* pDependencies,
+                                             size_t numDependencies,
+                                             const hipHostNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddHostNode(pGraphNode, graph, pDependencies, numDependencies, pNodeParams));
+}
+
+inline static hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode,
+                                                         hipGraph_t graph,
+                                                         const hipGraphNode_t* pDependencies,
+                                                         size_t numDependencies, void* dst,
+                                                         const void* symbol, size_t count,
+                                                         size_t offset, hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaGraphAddMemcpyNodeFromSymbol(
+        pGraphNode, graph, pDependencies, numDependencies, dst, symbol, count, offset, kind));
+}
+
+inline static hipError_t hipGraphAddMemcpyNodeToSymbol(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                                       const hipGraphNode_t* pDependencies,
+                                                       size_t numDependencies, const void* symbol,
+                                                       const void* src, size_t count, size_t offset,
+                                                       hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaGraphAddMemcpyNodeToSymbol(
+        pGraphNode, graph, pDependencies, numDependencies, symbol, src, count, offset, kind));
+}
+
+inline static hipError_t hipGraphEventRecordNodeSetEvent(hipGraphNode_t node, hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaGraphEventRecordNodeSetEvent(node, event));
+}
+
+inline static hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipEvent_t* event_out) {
+    return hipCUDAErrorTohipError(cudaGraphEventWaitNodeGetEvent(node, event_out));
+}
+
+inline static hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaGraphEventWaitNodeSetEvent(node, event));
+}
+
+inline static hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec,
+                                                       hipGraphNode_t node,
+                                                       const hipHostNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExecHostNodeSetParams(hGraphExec, node, pNodeParams));
+}
+
+inline static hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec,
+                                                         hipGraphNode_t node,
+                                                         hipMemcpy3DParms* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams));
+}
+
+inline static hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec,
+                                                           hipGraphNode_t node, void* dst,
+                                                           const void* src, size_t count,
+                                                           hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExecMemcpyNodeSetParams1D(hGraphExec, node, dst, src, count, kind));
+}
+
+inline static hipError_t hipGraphExecMemcpyNodeSetParamsFromSymbol(hipGraphExec_t hGraphExec,
+                                                                   hipGraphNode_t node, void* dst,
+                                                                   const void* symbol, size_t count,
+                                                                   size_t offset,
+                                                                   hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+        hGraphExec, node, dst, symbol, count, offset, kind));
+}
+
+inline static hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(
+    hipGraphExec_t hGraphExec, hipGraphNode_t node, const void* symbol, const void* src,
+    size_t count, size_t offset, hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaGraphExecMemcpyNodeSetParamsToSymbol(
+        hGraphExec, node, symbol, src, count, offset, kind));
+}
+
+inline static hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec,
+                                                         hipGraphNode_t node,
+                                                         const hipMemsetParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExecMemsetNodeSetParams(hGraphExec, node, pNodeParams));
+}
+
+inline static hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_t hGraph,
+                                            hipGraphNode_t* hErrorNode_out,
+                                            hipGraphExecUpdateResult* updateResult_out) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExecUpdate(hGraphExec, hGraph, hErrorNode_out, updateResult_out));
+}
+
+inline static hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst,
+                                                               const void* symbol, size_t count,
+                                                               size_t offset, hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(
+        cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, symbol, count, offset, kind));
+}
+
+inline static hipError_t hipGraphMemcpyNodeSetParamsToSymbol(hipGraphNode_t node,
+                                                             const void* symbol, const void* src,
+                                                             size_t count, size_t offset,
+                                                             hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(
+        cudaGraphMemcpyNodeSetParamsToSymbol(node, symbol, src, count, offset, kind));
+}
+
+inline static hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node,
+                                                         hipEvent_t* event_out) {
+    return hipCUDAErrorTohipError(cudaGraphEventRecordNodeGetEvent(node, event_out));
+}
+
+inline static hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node,
+                                                   hipHostNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphHostNodeGetParams(node, pNodeParams));
+}
+
+inline static hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst,
+                                                       const void* src, size_t count,
+                                                       hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaGraphMemcpyNodeSetParams1D(node, dst, src, count, kind));
+}
+
+inline static hipError_t hipGraphExecEventRecordNodeSetEvent(hipGraphExec_t hGraphExec,
+                                                             hipGraphNode_t hNode,
+                                                             hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaGraphExecEventRecordNodeSetEvent(hGraphExec, hNode, event));
+}
+
+inline static hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraphExec,
+                                                           hipGraphNode_t hNode, hipEvent_t event) {
+    return hipCUDAErrorTohipError(cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event));
+}
+
+inline static hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node,
+                                                   const hipHostNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphHostNodeSetParams(node, pNodeParams));
+}
+
 #endif  //__CUDACC__
 
 #endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H

From 5a4305c05a4199388123c74fdc9250b363c49aed Mon Sep 17 00:00:00 2001
From: haoyuan2 <Hao.Yuan@amd.com>
Date: Mon, 22 Nov 2021 10:21:16 -0800
Subject: [PATCH 029/177] SWDEV-309292 - add hipPos definition in CUDA path

Change-Id: Ia81d5db77d7403cafb583fda8083d0f4dbe33835
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 657d354a88..6e7989e1e2 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -391,6 +391,7 @@ typedef cudaSurfaceObject_t hipSurfaceObject_t;
 #define hipInvalidDeviceId cudaInvalidDeviceId
 typedef struct cudaExtent hipExtent;
 typedef struct cudaPitchedPtr hipPitchedPtr;
+typedef struct cudaPos hipPos;
 #define make_hipExtent make_cudaExtent
 #define make_hipPos make_cudaPos
 #define make_hipPitchedPtr make_cudaPitchedPtr

From 1a017b591e3fb2fea92b1fdf6d9b4413525ef92d Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Mon, 27 Sep 2021 19:08:32 +0000
Subject: [PATCH 030/177] SWDEV-292714 - Added support for
 hipPointerGetAttribute and hipDrvPointerGetAttributes

Change-Id: I44a0b5c5d9eb5a9a63406b11e8de83e5cb245a59
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 6e7989e1e2..3cc16415eb 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -362,6 +362,7 @@ typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
 typedef struct cudaLaunchParams hipLaunchParams;
 #define hipFunction_attribute CUfunction_attribute
+#define hipPointer_attribute CUpointer_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
 #define HIP_MEMCPY3D CUDA_MEMCPY3D
 #define hipMemcpy3DParms cudaMemcpy3DParms
@@ -421,6 +422,25 @@ typedef struct cudaResourceViewDesc hipResourceViewDesc;
 #define HIP_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT
 #define HIP_FUNC_ATTRIBUTE_MAX CU_FUNC_ATTRIBUTE_MAX
 
+//Pointer Attributes
+#define HIP_POINTER_ATTRIBUTE_CONTEXT           CU_POINTER_ATTRIBUTE_CONTEXT
+#define HIP_POINTER_ATTRIBUTE_MEMORY_TYPE       CU_POINTER_ATTRIBUTE_MEMORY_TYPE
+#define HIP_POINTER_ATTRIBUTE_DEVICE_POINTER    CU_POINTER_ATTRIBUTE_DEVICE_POINTER
+#define HIP_POINTER_ATTRIBUTE_HOST_POINTER      CU_POINTER_ATTRIBUTE_HOST_POINTER
+#define HIP_POINTER_ATTRIBUTE_P2P_TOKENS        CU_POINTER_ATTRIBUTE_P2P_TOKENS
+#define HIP_POINTER_ATTRIBUTE_SYNC_MEMOPS       CU_POINTER_ATTRIBUTE_SYNC_MEMOPS
+#define HIP_POINTER_ATTRIBUTE_BUFFER_ID         CU_POINTER_ATTRIBUTE_BUFFER_ID
+#define HIP_POINTER_ATTRIBUTE_IS_MANAGED        CU_POINTER_ATTRIBUTE_IS_MANAGED
+#define HIP_POINTER_ATTRIBUTE_DEVICE_ORDINAL    CU_POINTER_ATTRIBUTE_DEVICE_ORDINAL
+#define HIP_POINTER_ATTRIBUTE_IS_LEGACY_HIP_IPC_CAPABLE  CU_POINTER_ATTRIBUTE_IS_LEGACY_CUDA_IPC_CAPABLE
+#define HIP_POINTER_ATTRIBUTE_RANGE_START_ADDR  CU_POINTER_ATTRIBUTE_RANGE_START_ADDR
+#define HIP_POINTER_ATTRIBUTE_RANGE_SIZE        CU_POINTER_ATTRIBUTE_RANGE_SIZE
+#define HIP_POINTER_ATTRIBUTE_MAPPED            CU_POINTER_ATTRIBUTE_MAPPED
+#define HIP_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES CU_POINTER_ATTRIBUTE_ALLOWED_HANDLE_TYPES
+#define HIP_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE CU_POINTER_ATTRIBUTE_IS_GPU_DIRECT_RDMA_CAPABLE
+#define HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS      CU_POINTER_ATTRIBUTE_ACCESS_FLAGS
+#define HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
+
 #if CUDA_VERSION >= 9000
 #define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
 #define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
@@ -1731,6 +1751,17 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
     return err;
 }
 
+inline static hipError_t hipPointerGetAttribute(void* data, hipPointer_attribute attribute,
+                                                hipDeviceptr_t ptr) {
+    return hipCUResultTohipError(cuPointerGetAttribute(data, attribute, ptr));
+}
+
+inline static hipError_t hipDrvPointerGetAttributes(unsigned int numAttributes,
+                                                    hipPointer_attribute* attributes,
+                                                    void** data, hipDeviceptr_t ptr) {
+    return hipCUResultTohipError(cuPointerGetAttributes(numAttributes, attributes, data, ptr));
+}
+
 inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {
     return hipCUDAErrorTohipError(cudaMemGetInfo(free, total));
 }

From d64b750aec2d420ea52622a0951ea26dcb779153 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 6 Jan 2022 13:42:24 +0000
Subject: [PATCH 031/177] SWDEV-314661 - Add hipGraphInstantiateWithFlags API
 mapping in nvidia path

Change-Id: Ibbf54ef18978448e92571f0535124844154726b6
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 3cc16415eb..36804e1325 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2316,6 +2316,11 @@ inline static hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGrap
         cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize));
 }
 
+inline static hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                                      unsigned long long flags) {
+    return hipCUDAErrorTohipError(cudaGraphInstantiateWithFlags(pGraphExec, graph, flags));
+}
+
 inline static hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaGraphLaunch(graphExec, stream));
 }

From 334f3c644964b4bc4be77ff3f89a56a595363c7d Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 6 Jan 2022 13:20:18 +0000
Subject: [PATCH 032/177] SWDEV-317135 - Fix mapping of
 hipStreamUpdateCaptureDependencies on cuda path

Change-Id: Ibbcf8136fa527a20684a377311bc76182345d17b
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 36804e1325..9f1539bb71 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2542,7 +2542,8 @@ inline static hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream,
                                                             hipGraphNode_t* dependencies,
                                                             size_t numDependencies,
                                                             unsigned int flags __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaStreamUpdateCaptureDependencies(stream, dependencies, flags));
+    return hipCUDAErrorTohipError(cudaStreamUpdateCaptureDependencies(stream, dependencies,
+                                                                      numDependencies, flags));
 }
 
 inline static hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,

From c4dcd3398c5a0f2eeb5b915ab27b97b0bf75c5c1 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 5 Jan 2022 14:51:42 +0000
Subject: [PATCH 033/177] SWDEV-315017 - Add missing graph error strings

Change-Id: I11b5a623756c5bef88cbc93e49c124d0caf62bd1
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 9f1539bb71..cd71094e49 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -474,6 +474,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
             return hipErrorSharedObjectInitFailed;
         case cudaErrorOperatingSystem:
             return hipErrorOperatingSystem;
+        case cudaErrorIllegalState:
+            return hipErrorIllegalState;
         case cudaErrorSetOnActiveProcess:
             return hipErrorSetOnActiveProcess;
         case cudaErrorIllegalAddress:
@@ -502,6 +504,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
             return hipErrorInvalidDevice;
         case cudaErrorInvalidValue:
             return hipErrorInvalidValue;
+        case cudaErrorInvalidPitchValue:
+            return hipErrorInvalidPitchValue;
         case cudaErrorInvalidDevicePointer:
             return hipErrorInvalidDevicePointer;
         case cudaErrorInvalidMemcpyDirection:
@@ -516,6 +520,8 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
             return hipErrorPeerAccessAlreadyEnabled;
         case cudaErrorPeerAccessNotEnabled:
             return hipErrorPeerAccessNotEnabled;
+        case cudaErrorContextIsDestroyed:
+            return hipErrorContextIsDestroyed;
         case cudaErrorHostMemoryAlreadyRegistered:
             return hipErrorHostMemoryAlreadyRegistered;
         case cudaErrorHostMemoryNotRegistered:
@@ -566,6 +572,26 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
         case cudaErrorDeviceUninitialized:
             return hipErrorInvalidContext;
 #endif
+        case cudaErrorStreamCaptureUnsupported:
+            return hipErrorStreamCaptureUnsupported;
+        case cudaErrorStreamCaptureInvalidated:
+            return hipErrorStreamCaptureInvalidated;
+        case cudaErrorStreamCaptureMerge:
+            return hipErrorStreamCaptureMerge;
+        case cudaErrorStreamCaptureUnmatched:
+            return hipErrorStreamCaptureUnmatched;
+        case cudaErrorStreamCaptureUnjoined:
+            return hipErrorStreamCaptureUnjoined;
+        case cudaErrorStreamCaptureIsolation:
+            return hipErrorStreamCaptureIsolation;
+        case cudaErrorStreamCaptureImplicit:
+            return hipErrorStreamCaptureImplicit;
+        case cudaErrorCapturedEvent:
+            return hipErrorCapturedEvent;
+        case cudaErrorStreamCaptureWrongThread:
+            return hipErrorStreamCaptureWrongThread;
+        case cudaErrorGraphExecUpdateFailure:
+            return hipErrorGraphExecUpdateFailure;
         case cudaErrorUnknown:
         default:
             return hipErrorUnknown;  // Note - translated error.
@@ -644,6 +670,8 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
             return hipErrorSharedObjectInitFailed;
         case CUDA_ERROR_OPERATING_SYSTEM:
             return hipErrorOperatingSystem;
+        case CUDA_ERROR_ILLEGAL_STATE:
+            return hipErrorIllegalState;
         case CUDA_ERROR_NOT_FOUND:
             return hipErrorNotFound;
         case CUDA_ERROR_NOT_READY:
@@ -660,6 +688,8 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
             return hipErrorPeerAccessNotEnabled;
         case CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE:
             return hipErrorSetOnActiveProcess;
+        case CUDA_ERROR_CONTEXT_IS_DESTROYED:
+            return hipErrorContextIsDestroyed;
         case CUDA_ERROR_ASSERT:
             return hipErrorAssert;
         case CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED:
@@ -672,6 +702,26 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
             return hipErrorCooperativeLaunchTooLarge;
         case CUDA_ERROR_NOT_SUPPORTED:
             return hipErrorNotSupported;
+        case CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED:
+            return hipErrorStreamCaptureUnsupported;
+        case CUDA_ERROR_STREAM_CAPTURE_INVALIDATED:
+            return hipErrorStreamCaptureInvalidated;
+        case CUDA_ERROR_STREAM_CAPTURE_MERGE:
+            return hipErrorStreamCaptureMerge;
+        case CUDA_ERROR_STREAM_CAPTURE_UNMATCHED:
+            return hipErrorStreamCaptureUnmatched;
+        case CUDA_ERROR_STREAM_CAPTURE_UNJOINED:
+            return hipErrorStreamCaptureUnjoined;
+        case CUDA_ERROR_STREAM_CAPTURE_ISOLATION:
+            return hipErrorStreamCaptureIsolation;
+        case CUDA_ERROR_STREAM_CAPTURE_IMPLICIT:
+            return hipErrorStreamCaptureImplicit;
+        case CUDA_ERROR_CAPTURED_EVENT:
+            return hipErrorCapturedEvent;
+        case CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD:
+            return hipErrorStreamCaptureWrongThread;
+        case CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE:
+            return hipErrorGraphExecUpdateFailure;
         case CUDA_ERROR_UNKNOWN:
         default:
             return hipErrorUnknown;  // Note - translated error.
@@ -698,6 +748,8 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
             return cudaErrorLaunchOutOfResources;
         case hipErrorInvalidValue:
             return cudaErrorInvalidValue;
+        case hipErrorInvalidPitchValue:
+            return cudaErrorInvalidPitchValue;
         case hipErrorInvalidHandle:
             return cudaErrorInvalidResourceHandle;
         case hipErrorInvalidDevice:
@@ -812,6 +864,8 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
             return cudaErrorSharedObjectInitFailed;
         case hipErrorOperatingSystem:
             return cudaErrorOperatingSystem;
+        case hipErrorIllegalState:
+            return cudaErrorIllegalState;
         case hipErrorNotFound:
 #if CUDA_VERSION >= 10010
             return cudaErrorSymbolNotFound;
@@ -824,10 +878,34 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
             return cudaErrorLaunchTimeout;
         case hipErrorSetOnActiveProcess:
             return cudaErrorSetOnActiveProcess;
+        case hipErrorContextIsDestroyed:
+            return cudaErrorContextIsDestroyed;
+        case hipErrorAssert:
+            return cudaErrorAssert;
         case hipErrorLaunchFailure:
             return cudaErrorLaunchFailure;
         case hipErrorCooperativeLaunchTooLarge:
             return cudaErrorCooperativeLaunchTooLarge;
+        case hipErrorStreamCaptureUnsupported:
+            return cudaErrorStreamCaptureUnsupported;
+        case hipErrorStreamCaptureInvalidated:
+            return cudaErrorStreamCaptureInvalidated;
+        case hipErrorStreamCaptureMerge:
+            return cudaErrorStreamCaptureMerge;
+        case hipErrorStreamCaptureUnmatched:
+            return cudaErrorStreamCaptureUnmatched;
+        case hipErrorStreamCaptureUnjoined:
+            return cudaErrorStreamCaptureUnjoined;
+        case hipErrorStreamCaptureIsolation:
+            return cudaErrorStreamCaptureIsolation;
+        case hipErrorStreamCaptureImplicit:
+            return cudaErrorStreamCaptureImplicit;
+        case hipErrorCapturedEvent:
+            return cudaErrorCapturedEvent;
+        case hipErrorStreamCaptureWrongThread:
+            return cudaErrorStreamCaptureWrongThread;
+        case hipErrorGraphExecUpdateFailure:
+            return cudaErrorGraphExecUpdateFailure;
         case hipErrorNotSupported:
             return cudaErrorNotSupported;
         // HSA: does not exist in CUDA

From f607f01311628b3c3e2587f4dbb6891147cba6d2 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Fri, 7 Jan 2022 13:51:44 +0000
Subject: [PATCH 034/177] SWDEV-315118 - Add version checks for few graph APIs
 on Nvidia path

HIP apps running on CUDA 11.0 are failing due to some graph APIs/enums
added only in CUDA 11.1 or CUDA 11.3

Change-Id: I0d32b412cb76c42c7b3a9c612d750990f9e89908
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 54 ++++++++++++++-----
 1 file changed, 40 insertions(+), 14 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cd71094e49..fdeb80f6bb 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -29,6 +29,12 @@ THE SOFTWARE.
 #include <cuda_fp16.h>
 #include <stdio.h>
 
+#define CUDA_9000 9000
+#define CUDA_10010 10010
+#define CUDA_10020 10020
+#define CUDA_11010 11010
+#define CUDA_11030 11030
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -441,12 +447,12 @@ typedef struct cudaResourceViewDesc hipResourceViewDesc;
 #define HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS      CU_POINTER_ATTRIBUTE_ACCESS_FLAGS
 #define HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
 
-#if CUDA_VERSION >= 9000
+#if CUDA_VERSION >= CUDA_9000
 #define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
 #define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
 #define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
 #define __shfl_xor(...)  __shfl_xor_sync(0xffffffff, __VA_ARGS__)
-#endif // CUDA_VERSION >= 9000
+#endif // CUDA_VERSION >= CUDA_9000
 
 inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
     switch (cuError) {
@@ -548,7 +554,7 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
             return hipErrorInvalidKernelFile;
         case cudaErrorLaunchTimeout:
             return hipErrorLaunchTimeOut;
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
         case cudaErrorInvalidSource:
             return hipErrorInvalidSource;
         case cudaErrorFileNotFound:
@@ -568,7 +574,7 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
         case cudaErrorAlreadyMapped:
             return hipErrorAlreadyMapped;
 #endif
-#if CUDA_VERSION >= 10020
+#if CUDA_VERSION >= CUDA_10020
         case cudaErrorDeviceUninitialized:
             return hipErrorInvalidContext;
 #endif
@@ -787,7 +793,7 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
         case hipErrorInvalidImage:
             return cudaErrorInvalidKernelImage;
         case hipErrorInvalidContext:
-#if CUDA_VERSION >= 10020
+#if CUDA_VERSION >= CUDA_10020
             return cudaErrorDeviceUninitialized;
 #else
             return cudaErrorUnknown;
@@ -797,13 +803,13 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
         case hipErrorUnmapFailed:
             return cudaErrorUnmapBufferObjectFailed;
         case hipErrorArrayIsMapped:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorArrayIsMapped;
 #else
             return cudaErrorUnknown;
 #endif
         case hipErrorAlreadyMapped:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorAlreadyMapped;
 #else
             return cudaErrorUnknown;
@@ -811,25 +817,25 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
         case hipErrorNoBinaryForGpu:
             return cudaErrorNoKernelImageForDevice;
         case hipErrorAlreadyAcquired:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorAlreadyAcquired;
 #else
             return cudaErrorUnknown;
 #endif
         case hipErrorNotMapped:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorNotMapped;
 #else
             return cudaErrorUnknown;
 #endif
         case hipErrorNotMappedAsArray:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorNotMappedAsArray;
 #else
             return cudaErrorUnknown;
 #endif
         case hipErrorNotMappedAsPointer:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorNotMappedAsPointer;
 #else
             return cudaErrorUnknown;
@@ -847,13 +853,13 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
         case hipErrorInvalidGraphicsContext:
             return cudaErrorInvalidGraphicsContext;
         case hipErrorInvalidSource:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorInvalidSource;
 #else
             return cudaErrorUnknown;
 #endif
         case hipErrorFileNotFound:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorFileNotFound;
 #else
             return cudaErrorUnknown;
@@ -867,7 +873,7 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
         case hipErrorIllegalState:
             return cudaErrorIllegalState;
         case hipErrorNotFound:
-#if CUDA_VERSION >= 10010
+#if CUDA_VERSION >= CUDA_10010
             return cudaErrorSymbolNotFound;
 #else
             return cudaErrorUnknown;
@@ -1073,9 +1079,11 @@ typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
 #define hipStreamCaptureStatusActive cudaStreamCaptureStatusActive
 #define hipStreamCaptureStatusInvalidated cudaStreamCaptureStatusInvalidated
 
+#if CUDA_VERSION >= CUDA_11030
 typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDependenciesFlags;
 #define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies
 #define hipStreamSetCaptureDependencies cudaStreamSetCaptureDependencies
+#endif
 
 /**
  * Stream CallBack struct
@@ -2419,12 +2427,14 @@ inline static hipError_t hipGraphAddMemcpyNode(hipGraphNode_t* pGraphNode, hipGr
         cudaGraphAddMemcpyNode(pGraphNode, graph, pDependencies, numDependencies, pCopyParams));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphAddMemcpyNode1D(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                    const hipGraphNode_t* pDependencies, size_t numDependencies,
                                    void* dst, const void* src, size_t count, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(
         cudaGraphAddMemcpyNode1D(pGraphNode, graph, pDependencies, numDependencies, dst, src, count, kind));
 }
+#endif
 
 inline static hipError_t hipGraphAddMemsetNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                                const hipGraphNode_t* pDependencies,
@@ -2590,12 +2600,14 @@ inline static hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hip
     return hipCUDAErrorTohipError(cudaGraphChildGraphNodeGetGraph(node, pGraph));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphExecChildGraphNodeSetParams(hipGraphExec_t hGraphExec,
                                                              hipGraphNode_t node,
                                                              hipGraph_t childGraph) {
     return hipCUDAErrorTohipError(
         cudaGraphExecChildGraphNodeSetParams(hGraphExec, node, childGraph));
 }
+#endif
 
 inline static hipError_t hipStreamGetCaptureInfo(hipStream_t stream,
                                                  hipStreamCaptureStatus* pCaptureStatus,
@@ -2603,6 +2615,7 @@ inline static hipError_t hipStreamGetCaptureInfo(hipStream_t stream,
     return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo(stream, pCaptureStatus, pId));
 }
 
+#if CUDA_VERSION >= CUDA_11030
 inline static hipError_t hipStreamGetCaptureInfo_v2(
     hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
     unsigned long long* id_out __dparm(0), hipGraph_t* graph_out __dparm(0),
@@ -2610,12 +2623,14 @@ inline static hipError_t hipStreamGetCaptureInfo_v2(
     return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo_v2(
         stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out));
 }
+#endif
 
 inline static hipError_t hipStreamIsCapturing(hipStream_t stream,
                                               hipStreamCaptureStatus* pCaptureStatus) {
     return hipCUDAErrorTohipError(cudaStreamIsCapturing(stream, pCaptureStatus));
 }
 
+#if CUDA_VERSION >= CUDA_11030
 inline static hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream,
                                                             hipGraphNode_t* dependencies,
                                                             size_t numDependencies,
@@ -2623,7 +2638,9 @@ inline static hipError_t hipStreamUpdateCaptureDependencies(hipStream_t stream,
     return hipCUDAErrorTohipError(cudaStreamUpdateCaptureDependencies(stream, dependencies,
                                                                       numDependencies, flags));
 }
+#endif
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphAddEventRecordNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                                     const hipGraphNode_t* pDependencies,
                                                     size_t numDependencies, hipEvent_t event) {
@@ -2637,6 +2654,7 @@ inline static hipError_t hipGraphAddEventWaitNode(hipGraphNode_t* pGraphNode, hi
     return hipCUDAErrorTohipError(
         cudaGraphAddEventWaitNode(pGraphNode, graph, pDependencies, numDependencies, event));
 }
+#endif
 
 inline static hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
                                              const hipGraphNode_t* pDependencies,
@@ -2646,6 +2664,7 @@ inline static hipError_t hipGraphAddHostNode(hipGraphNode_t* pGraphNode, hipGrap
         cudaGraphAddHostNode(pGraphNode, graph, pDependencies, numDependencies, pNodeParams));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphAddMemcpyNodeFromSymbol(hipGraphNode_t* pGraphNode,
                                                          hipGraph_t graph,
                                                          const hipGraphNode_t* pDependencies,
@@ -2676,6 +2695,7 @@ inline static hipError_t hipGraphEventWaitNodeGetEvent(hipGraphNode_t node, hipE
 inline static hipError_t hipGraphEventWaitNodeSetEvent(hipGraphNode_t node, hipEvent_t event) {
     return hipCUDAErrorTohipError(cudaGraphEventWaitNodeSetEvent(node, event));
 }
+#endif
 
 inline static hipError_t hipGraphExecHostNodeSetParams(hipGraphExec_t hGraphExec,
                                                        hipGraphNode_t node,
@@ -2689,6 +2709,7 @@ inline static hipError_t hipGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphEx
     return hipCUDAErrorTohipError(cudaGraphExecMemcpyNodeSetParams(hGraphExec, node, pNodeParams));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphExecMemcpyNodeSetParams1D(hipGraphExec_t hGraphExec,
                                                            hipGraphNode_t node, void* dst,
                                                            const void* src, size_t count,
@@ -2712,6 +2733,7 @@ inline static hipError_t hipGraphExecMemcpyNodeSetParamsToSymbol(
     return hipCUDAErrorTohipError(cudaGraphExecMemcpyNodeSetParamsToSymbol(
         hGraphExec, node, symbol, src, count, offset, kind));
 }
+#endif
 
 inline static hipError_t hipGraphExecMemsetNodeSetParams(hipGraphExec_t hGraphExec,
                                                          hipGraphNode_t node,
@@ -2726,6 +2748,7 @@ inline static hipError_t hipGraphExecUpdate(hipGraphExec_t hGraphExec, hipGraph_
         cudaGraphExecUpdate(hGraphExec, hGraph, hErrorNode_out, updateResult_out));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphMemcpyNodeSetParamsFromSymbol(hipGraphNode_t node, void* dst,
                                                                const void* symbol, size_t count,
                                                                size_t offset, hipMemcpyKind kind) {
@@ -2745,12 +2768,14 @@ inline static hipError_t hipGraphEventRecordNodeGetEvent(hipGraphNode_t node,
                                                          hipEvent_t* event_out) {
     return hipCUDAErrorTohipError(cudaGraphEventRecordNodeGetEvent(node, event_out));
 }
+#endif
 
 inline static hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node,
                                                    hipHostNodeParams* pNodeParams) {
     return hipCUDAErrorTohipError(cudaGraphHostNodeGetParams(node, pNodeParams));
 }
 
+#if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst,
                                                        const void* src, size_t count,
                                                        hipMemcpyKind kind) {
@@ -2767,6 +2792,7 @@ inline static hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraph
                                                            hipGraphNode_t hNode, hipEvent_t event) {
     return hipCUDAErrorTohipError(cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event));
 }
+#endif
 
 inline static hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node,
                                                    const hipHostNodeParams* pNodeParams) {

From e3aafb96362966abaa42fad90550361477764499 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Fri, 4 Feb 2022 12:53:39 +0000
Subject: [PATCH 035/177] SWDEV-315981 - make nvidia hiprtc functions static

Change-Id: I945426c155ed9ce21c308745f384bbf8669d73c4
---
 .../include/hip/nvidia_detail/nvidia_hiprtc.h | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
index db5657e6e6..68864e75c8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2021 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -112,52 +112,52 @@ inline static hiprtcResult nvrtcResultTohiprtcResult(nvrtcResult result) {
   }
 }
 
-const char* hiprtcGetErrorString(hiprtcResult result) {
+inline static const char* hiprtcGetErrorString(hiprtcResult result) {
   return nvrtcGetErrorString(hiprtcResultTonvrtcResult(result));
 }
 
-hiprtcResult hiprtcVersion(int* major, int* minor) {
+inline static hiprtcResult hiprtcVersion(int* major, int* minor) {
   return nvrtcResultTohiprtcResult(nvrtcVersion(major, minor));
 }
 
 typedef nvrtcProgram hiprtcProgram;
 
-hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
+inline static hiprtcResult hiprtcAddNameExpression(hiprtcProgram prog, const char* name_expression) {
   return nvrtcResultTohiprtcResult(nvrtcAddNameExpression(prog, name_expression));
 }
 
-hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
+inline static hiprtcResult hiprtcCompileProgram(hiprtcProgram prog, int numOptions, const char** options) {
   return nvrtcResultTohiprtcResult(nvrtcCompileProgram(prog, numOptions, options));
 }
 
-hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
+inline static hiprtcResult hiprtcCreateProgram(hiprtcProgram* prog, const char* src, const char* name,
                                  int numHeaders, const char** headers, const char** includeNames) {
   return nvrtcResultTohiprtcResult(
       nvrtcCreateProgram(prog, src, name, numHeaders, headers, includeNames));
 }
 
-hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
+inline static hiprtcResult hiprtcDestroyProgram(hiprtcProgram* prog) {
   return nvrtcResultTohiprtcResult(nvrtcDestroyProgram(prog));
 }
 
-hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
+inline static hiprtcResult hiprtcGetLoweredName(hiprtcProgram prog, const char* name_expression,
                                   const char** lowered_name) {
   return nvrtcResultTohiprtcResult(nvrtcGetLoweredName(prog, name_expression, lowered_name));
 }
 
-hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
+inline static hiprtcResult hiprtcGetProgramLog(hiprtcProgram prog, char* log) {
   return nvrtcResultTohiprtcResult(nvrtcGetProgramLog(prog, log));
 }
 
-hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
+inline static hiprtcResult hiprtcGetProgramLogSize(hiprtcProgram prog, size_t* logSizeRet) {
   return nvrtcResultTohiprtcResult(nvrtcGetProgramLogSize(prog, logSizeRet));
 }
 
-hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
+inline static hiprtcResult hiprtcGetCode(hiprtcProgram prog, char* code) {
   return nvrtcResultTohiprtcResult(nvrtcGetPTX(prog, code));
 }
 
-hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
+inline static hiprtcResult hiprtcGetCodeSize(hiprtcProgram prog, size_t* codeSizeRet) {
   return nvrtcResultTohiprtcResult(nvrtcGetPTXSize(prog, codeSizeRet));
 }
 

From 974138cfd553dc1feeae3a7966170a3195c6bd0d Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 2 Feb 2022 15:51:09 +0000
Subject: [PATCH 036/177] EXSWCPHIPT-13 - Fix Segfault on Nvidia Platform

Change-Id: Ib85bb0b0e91b7703afc4858c4a5ada0115ad68f4
---
 .../include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index fdeb80f6bb..e6e9bd7a9b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1220,6 +1220,11 @@ inline static hipError_t hipSetDevice(int device) {
 }
 
 inline static hipError_t hipChooseDevice(int* device, const hipDeviceProp_t* prop) {
+
+    if (prop == NULL) {
+      return hipErrorInvalidValue;
+    }
+
     struct cudaDeviceProp cdprop;
     memset(&cdprop, 0x0, sizeof(struct cudaDeviceProp));
     cdprop.major = prop->major;
@@ -1532,6 +1537,11 @@ inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int  valu
 }
 
 inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {
+
+    if (p_prop == NULL) {
+      return hipErrorInvalidValue;
+    }
+
     struct cudaDeviceProp cdprop;
     cudaError_t cerror;
     cerror = cudaGetDeviceProperties(&cdprop, device);

From d557ade2add40702894df5b557b3a7fb572c153c Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 14 Feb 2022 15:12:13 -0500
Subject: [PATCH 037/177] SWDEV-321698 - hipGraphNodeTypeMemcpy1D NVIDIA build
 failure

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I7aea11bf14adbeb7fea68eb862df74fcf6f13d75
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index e6e9bd7a9b..b3f24dc4b5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1048,7 +1048,8 @@ typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeEmpty cudaGraphNodeTypeEmpty
 #define hipGraphNodeTypeWaitEvent cudaGraphNodeTypeWaitEvent
 #define hipGraphNodeTypeEventRecord cudaGraphNodeTypeEventRecord
-#define hipGraphNodeTypeMemcpy1D cudaGraphNodeTypeMemcpy1D
+#define hipGraphNodeTypeExtSemaphoreSignal cudaGraphNodeTypeExtSemaphoreSignal
+#define hipGraphNodeTypeExtSemaphoreWait  cudaGraphNodeTypeExtSemaphoreWait
 #define hipGraphNodeTypeMemcpyFromSymbol cudaGraphNodeTypeMemcpyFromSymbol
 #define hipGraphNodeTypeMemcpyToSymbol cudaGraphNodeTypeMemcpyToSymbol
 #define hipGraphNodeTypeCount cudaGraphNodeTypeCount

From 16dbf85da0c94dd519c59768a1cbd48473643eae Mon Sep 17 00:00:00 2001
From: Shadi Dashmiz <Shadi.Dashmiz@amd.com>
Date: Mon, 7 Mar 2022 19:08:12 -0500
Subject: [PATCH 038/177] Revert "SWDEV-321698 - hipGraphNodeTypeMemcpy1D NVIDIA
 build failure"

This reverts commit d557ade2add40702894df5b557b3a7fb572c153c.

Reason for revert: hip changes not in yet

Change-Id: I89e35d171cf01fb82d361279998efcc90e54ef33
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index b3f24dc4b5..e6e9bd7a9b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1048,8 +1048,7 @@ typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeEmpty cudaGraphNodeTypeEmpty
 #define hipGraphNodeTypeWaitEvent cudaGraphNodeTypeWaitEvent
 #define hipGraphNodeTypeEventRecord cudaGraphNodeTypeEventRecord
-#define hipGraphNodeTypeExtSemaphoreSignal cudaGraphNodeTypeExtSemaphoreSignal
-#define hipGraphNodeTypeExtSemaphoreWait  cudaGraphNodeTypeExtSemaphoreWait
+#define hipGraphNodeTypeMemcpy1D cudaGraphNodeTypeMemcpy1D
 #define hipGraphNodeTypeMemcpyFromSymbol cudaGraphNodeTypeMemcpyFromSymbol
 #define hipGraphNodeTypeMemcpyToSymbol cudaGraphNodeTypeMemcpyToSymbol
 #define hipGraphNodeTypeCount cudaGraphNodeTypeCount

From af18c3144fc3d655bcffdca0e10c8c201f33a391 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 9 Mar 2022 07:02:03 +0000
Subject: [PATCH 039/177] SWDEV-315118 - Add version check for
 hipGraphInstantiateWithFlags on nvidia path

Change-Id: I672b4a5287348bddf6eb8955decc9cce786275a2
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index e6e9bd7a9b..cb9b23aa6a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -34,6 +34,7 @@ THE SOFTWARE.
 #define CUDA_10020 10020
 #define CUDA_11010 11010
 #define CUDA_11030 11030
+#define CUDA_11040 11040
 
 #ifdef __cplusplus
 extern "C" {
@@ -2412,11 +2413,12 @@ inline static hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGrap
         cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize));
 }
 
+#if CUDA_VERSION >= CUDA_11040
 inline static hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
                                                       unsigned long long flags) {
     return hipCUDAErrorTohipError(cudaGraphInstantiateWithFlags(pGraphExec, graph, flags));
 }
-
+#endif
 inline static hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaGraphLaunch(graphExec, stream));
 }

From 3eca5176257648ba244d650af4dd91914118745c Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 13 Jan 2022 14:06:44 +0000
Subject: [PATCH 040/177] SWDEV-317716 - Add hipDeviceGetUuid API

Change-Id: I320c7bc11ddd7617e0246f6faf19135ad7363e73
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h     | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cb9b23aa6a..3cf47338b8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -2089,6 +2089,18 @@ inline static hipError_t hipDeviceGetName(char* name, int len, hipDevice_t devic
     return hipCUResultTohipError(cuDeviceGetName(name, len, device));
 }
 
+// Copies the driver-reported UUID of `device` into *uuid; hipErrorInvalidValue if uuid is NULL.
+inline static hipError_t hipDeviceGetUuid(hipUUID* uuid, hipDevice_t device) {
+    if (uuid == NULL) { return hipErrorInvalidValue; }
+    struct CUuuid_st CUuid;
+    hipError_t err = hipCUResultTohipError(cuDeviceGetUuid(&CUuid, device));
+    if (err == hipSuccess) {
+      // A UUID is binary data, not a C string: strncpy would stop at the first 0x00 byte.
+      memcpy(uuid->bytes, CUuid.bytes, sizeof(uuid->bytes));
+    }
+    return err;
+}
+
 inline static hipError_t hipDeviceGetP2PAttribute(int* value, hipDeviceP2PAttr attr,
                                                   int srcDevice, int dstDevice) {
     return hipCUDAErrorTohipError(cudaDeviceGetP2PAttribute(value, attr, srcDevice, dstDevice));

From d5be2bb91d5950e639ac11f99e2156d7689952ec Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Thu, 24 Mar 2022 17:30:40 +0530
Subject: [PATCH 041/177] SWDEV-326789 - Add Missing HIP API Functions For
 Nvidia Platform

Change-Id: I828028c5d893ca1487f33017a6902c3b7f786762
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 3cf47338b8..1babb5578e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -227,6 +227,7 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
 
 #define hipTexRef CUtexref
 #define hiparray CUarray
+typedef CUmipmappedArray hipMipmappedArray_t;
 
 // hipTextureAddressMode
 typedef enum cudaTextureAddressMode hipTextureAddressMode;
@@ -411,6 +412,9 @@ typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
 typedef struct cudaResourceDesc hipResourceDesc;
 typedef struct cudaTextureDesc hipTextureDesc;
 typedef struct cudaResourceViewDesc hipResourceViewDesc;
+typedef CUDA_RESOURCE_DESC HIP_RESOURCE_DESC;
+typedef CUDA_TEXTURE_DESC HIP_TEXTURE_DESC;
+typedef CUDA_RESOURCE_VIEW_DESC HIP_RESOURCE_VIEW_DESC;
 // adding code for hipmemSharedConfig
 #define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
 #define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
@@ -1191,6 +1195,22 @@ inline static hipError_t hipFreeArray(hipArray* array) {
     return hipCUDAErrorTohipError(cudaFreeArray(array));
 }
 
+inline static hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle,
+                                                 HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc,
+                                                 unsigned int numMipmapLevels) {
+    return hipCUResultTohipError(cuMipmappedArrayCreate(pHandle, pMipmappedArrayDesc, numMipmapLevels));
+}
+
+inline static hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray) {
+    return hipCUResultTohipError(cuMipmappedArrayDestroy(hMipmappedArray));
+}
+
+inline static hipError_t hipMipmappedArrayGetLevel(hipArray_t* pLevelArray,
+                                                   hipMipmappedArray_t hMipMappedArray,
+                                                   unsigned int level) {
+    return hipCUResultTohipError(cuMipmappedArrayGetLevel((CUarray*)pLevelArray, hMipMappedArray, level));
+}
+
 inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));
 }
@@ -2357,6 +2377,29 @@ inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 bloc
             cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
 }
 
+inline static hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject,
+                                            const HIP_RESOURCE_DESC* pResDesc,
+                                            const HIP_TEXTURE_DESC* pTexDesc,
+                                            const HIP_RESOURCE_VIEW_DESC* pResViewDesc) {
+    return hipCUResultTohipError(cuTexObjectCreate((CUtexObject*)pTexObject, pResDesc, pTexDesc, pResViewDesc));
+}
+
+inline static hipError_t hipTexObjectDestroy(hipTextureObject_t texObject) {
+    return hipCUResultTohipError(cuTexObjectDestroy((CUtexObject)texObject));
+}
+
+inline static hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc, hipTextureObject_t texObject) {
+    return hipCUResultTohipError(cuTexObjectGetResourceDesc(pResDesc, (CUtexObject)texObject));
+}
+
+inline static hipError_t hipTexObjectGetResourceViewDesc(HIP_RESOURCE_VIEW_DESC* pResViewDesc, hipTextureObject_t texObject) {
+    return hipCUResultTohipError(cuTexObjectGetResourceViewDesc(pResViewDesc, (CUtexObject)texObject));
+}
+
+inline static hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc, hipTextureObject_t texObject) {
+    return hipCUResultTohipError(cuTexObjectGetTextureDesc(pTexDesc, (CUtexObject)texObject));
+}
+
 inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
     return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
 }

From 3b89440eb1d1f65129d58d335b57a2c4c29c5834 Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Mon, 28 Mar 2022 12:30:04 -0400
Subject: [PATCH 042/177] SWDEV-311271 - Initial mempool implementation

HIP_MEM_POOL_SUPPORT controls memory pool support in runtime.
Currently it's disabled by default. The initial change doesn't
include: IPC, MGPU, virtual memory alloc, suballoc, defragmentation,
internal dependencies.

Change-Id: Ibed8528ebec698b045ebb247e49c0ecd6e587ed7
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 109 ++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 1babb5578e..6461e1e80a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -33,6 +33,7 @@ THE SOFTWARE.
 #define CUDA_10010 10010
 #define CUDA_10020 10020
 #define CUDA_11010 11010
+#define CUDA_11020 11020
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 
@@ -408,6 +409,15 @@ typedef struct cudaPos hipPos;
 #define hipStreamDefault cudaStreamDefault
 #define hipStreamNonBlocking cudaStreamNonBlocking
 
+typedef cudaMemPool_t hipMemPool_t;
+typedef enum cudaMemPoolAttr hipMemPoolAttr;
+typedef struct cudaMemLocation hipMemLocation;
+typedef struct cudaMemPoolProps hipMemPoolProps;
+typedef struct cudaMemAccessDesc hipMemAccessDesc;
+typedef enum cudaMemAccessFlags hipMemAccessFlags;
+typedef enum cudaMemAllocationHandleType hipMemAllocationHandleType;
+typedef struct cudaMemPoolPtrExportData hipMemPoolPtrExportData;
+
 typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
 typedef struct cudaResourceDesc hipResourceDesc;
 typedef struct cudaTextureDesc hipTextureDesc;
@@ -1781,6 +1791,11 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeDirectManagedMemAccessFromHost:
             cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
             break;
+#if CUDA_VERSION >= CUDA_11020
+        case hipDeviceAttributeMemoryPoolsSupported:
+            cdattr = cudaDevAttrMemoryPoolsSupported;
+            break;
+#endif // CUDA_VERSION >= CUDA_11020
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }
@@ -2284,6 +2299,100 @@ inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams*
     return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
 }
 
+#if CUDA_VERSION >= CUDA_11020
+// ========================== HIP Stream Ordered Memory Allocator =================================
+inline static hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device) {
+  return hipCUDAErrorTohipError(cudaDeviceGetDefaultMemPool(mem_pool, device));
+}
+
+inline static hipError_t hipDeviceSetMemPool(int device, hipMemPool_t mem_pool) {
+  return hipCUDAErrorTohipError(cudaDeviceSetMemPool(device, mem_pool));
+}
+
+inline static hipError_t hipDeviceGetMemPool(hipMemPool_t* mem_pool, int device) {
+  return hipCUDAErrorTohipError(cudaDeviceGetMemPool(mem_pool, device));
+}
+
+inline static hipError_t hipMallocAsync(void** dev_ptr, size_t size, hipStream_t stream) {
+  return hipCUDAErrorTohipError(cudaMallocAsync(dev_ptr, size, stream));
+}
+
+inline static hipError_t hipFreeAsync(void* dev_ptr, hipStream_t stream) {
+  return hipCUDAErrorTohipError(cudaFreeAsync(dev_ptr, stream));
+}
+
+inline static hipError_t hipMemPoolTrimTo(hipMemPool_t mem_pool, size_t min_bytes_to_hold) {
+  return hipCUDAErrorTohipError(cudaMemPoolTrimTo(mem_pool, min_bytes_to_hold));
+}
+
+inline static hipError_t hipMemPoolSetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value) {
+  return hipCUDAErrorTohipError(cudaMemPoolSetAttribute(mem_pool, attr, value));
+}
+
+inline static hipError_t hipMemPoolGetAttribute(hipMemPool_t mem_pool, hipMemPoolAttr attr, void* value) {
+  return hipCUDAErrorTohipError(cudaMemPoolGetAttribute(mem_pool, attr, value));
+}
+
+inline static hipError_t hipMemPoolSetAccess(
+    hipMemPool_t mem_pool,
+    const hipMemAccessDesc* desc_list,
+    size_t count) {
+  return hipCUDAErrorTohipError(cudaMemPoolSetAccess(mem_pool, desc_list, count));
+}
+
+inline static hipError_t hipMemPoolGetAccess(
+    hipMemAccessFlags* flags,
+    hipMemPool_t mem_pool,
+    hipMemLocation* location) {
+  return hipCUDAErrorTohipError(cudaMemPoolGetAccess(flags, mem_pool, location));
+}
+
+inline static hipError_t hipMemPoolCreate(hipMemPool_t* mem_pool, const hipMemPoolProps* pool_props) {
+  return hipCUDAErrorTohipError(cudaMemPoolCreate(mem_pool, pool_props));
+}
+
+inline static hipError_t hipMemPoolDestroy(hipMemPool_t mem_pool) {
+  return hipCUDAErrorTohipError(cudaMemPoolDestroy(mem_pool));
+}
+
+inline static hipError_t hipMallocFromPoolAsync(
+    void** dev_ptr,
+    size_t size,
+    hipMemPool_t mem_pool,
+    hipStream_t stream) {
+  return hipCUDAErrorTohipError(cudaMallocFromPoolAsync(dev_ptr, size, mem_pool, stream));
+}
+
+inline static hipError_t hipMemPoolExportToShareableHandle(
+    void*                      shared_handle,
+    hipMemPool_t               mem_pool,
+    hipMemAllocationHandleType handle_type,
+    unsigned int               flags) {
+  return hipCUDAErrorTohipError(cudaMemPoolExportToShareableHandle(
+            shared_handle, mem_pool, handle_type, flags));
+}
+
+inline static hipError_t hipMemPoolImportFromShareableHandle(
+    hipMemPool_t*              mem_pool,
+    void*                      shared_handle,
+    hipMemAllocationHandleType handle_type,
+    unsigned int               flags) {
+  return hipCUDAErrorTohipError(cudaMemPoolImportFromShareableHandle(
+            mem_pool, shared_handle, handle_type, flags));
+}
+
+inline static hipError_t hipMemPoolExportPointer(hipMemPoolPtrExportData* export_data, void* ptr) {
+  return hipCUDAErrorTohipError(cudaMemPoolExportPointer(export_data, ptr));
+}
+
+inline static hipError_t hipMemPoolImportPointer(
+    void**                   ptr,
+    hipMemPool_t             mem_pool,
+    hipMemPoolPtrExportData* export_data) {
+  return hipCUDAErrorTohipError(cudaMemPoolImportPointer(ptr, mem_pool, export_data));
+}
+#endif // CUDA_VERSION >= CUDA_11020
+
 #ifdef __cplusplus
 }
 #endif

From 0f378ffc09aacea396dd0e5c813e94b74b5725ea Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Tue, 29 Mar 2022 17:47:31 +0530
Subject: [PATCH 043/177] SWDEV-326789 - Mark texture reference APIs as
 deprecated

Change-Id: I7a39cc1f036cd185ac3aab6c5f44730f16225f69
---
 .../include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 6461e1e80a..ebd5f501a7 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2509,11 +2509,11 @@ inline static hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc,
     return hipCUResultTohipError(cuTexObjectGetTextureDesc(pTexDesc, (CUtexObject)texObject));
 }
 
-inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
     return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
 }
 
-inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexRef, hipFilter_mode fm){
     return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
 }
 
@@ -2525,15 +2525,15 @@ inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARR
     return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
 }
 
-inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
     return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
 }
 
-inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
     return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
 }
 
-inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
     return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
 }
 

From b712a51e746a00720752f2184151e909a2c0b1c4 Mon Sep 17 00:00:00 2001
From: Joseph Greathouse <Joseph.Greathouse@amd.com>
Date: Thu, 14 Apr 2022 20:11:52 -0500
Subject: [PATCH 044/177] SWDEV-332811 - Clean up and extend HIP unsafe atomic
 add

Update HIP's unsafeAtomicAdd to:
 - Compile properly even when not compiling for gfx90a
 - Fall back to safe atomic add on non-gfx90a architectures
 - Use flat atomic add for FP64 on gfx90a, instead of dynamically
   checking memory spaces.

In addition, when the compiler is passed -munsafe-fp-atomics, it
will define __AMDGCN_UNSAFE_FP_ATOMICS__. When this happens, the
compiler is requesting that the HIP headers force all HIP
atomicAdd() calls on floats or doubles to use their unsafe versions.

This patch therefore makes atomicAdd() call unsafeAtomicAdd() when that
define is seen. The same redirection applies to atomicSub(), since that
calls atomicAdd underneath. It is not applied to system-scope atomicAdd
because, on gfx90a, system-scope atomic FP add instructions would need
to target fine-grained memory, which is always unsafe.

This patch also creates safeAtomicAdd() functions for float and double.
These functions will create a standalone safe atomic, even when the
application is compiled with -munsafe-fp-atomics.

Finally, this patch adds wrappers in the Nvidia path of HIP so that
these HIP functions call through to atomicAdd there as well.

Change-Id: I8af0621d3d28ea30c9278bfeea7393d03bbdac6d
---
 .../hip/nvidia_detail/nvidia_hip_runtime.h    |  1 +
 .../nvidia_detail/nvidia_hip_unsafe_atomics.h | 68 +++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
index 007fc70085..b1002b71dd 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -26,6 +26,7 @@ THE SOFTWARE.
 #include <cuda_runtime.h>
 
 #include <hip/hip_runtime_api.h>
+#include "nvidia_hip_unsafe_atomics.h"
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
new file mode 100644
index 0000000000..919353129a
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
@@ -0,0 +1,68 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_UNSAFE_ATOMICS_H
+
+__device__ inline float unsafeAtomicAdd(float* addr, float value) {
+    return atomicAdd(addr, value);
+}
+
+// Atomically adds value to *addr and returns the double previously stored there.
+__device__ inline double unsafeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long *addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {  // CAS loop on the raw 64-bit pattern; used when __CUDA_ARCH__ < 600
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value + __longlong_as_double(expected)));
+    } while (expected != old_val);  // atomicCAS returned something else: swap lost, retry
+    return __longlong_as_double(old_val);  // reinterpret the bits, do not convert the integer
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+__device__ inline float safeAtomicAdd(float* addr, float value) {
+    return atomicAdd(addr, value);
+}
+
+// Atomically adds value to *addr and returns the double previously stored there.
+__device__ inline double safeAtomicAdd(double* addr, double value) {
+#if __CUDA_ARCH__ < 600
+    unsigned long long *addr_cast = (unsigned long long*)addr;
+    unsigned long long old_val = *addr_cast;
+    unsigned long long expected;
+    do {  // CAS loop on the raw 64-bit pattern; used when __CUDA_ARCH__ < 600
+        expected = old_val;
+        old_val = atomicCAS(addr_cast, expected,
+                            __double_as_longlong(value + __longlong_as_double(expected)));
+    } while (expected != old_val);  // atomicCAS returned something else: swap lost, retry
+    return __longlong_as_double(old_val);  // reinterpret the bits, do not convert the integer
+#else
+    return atomicAdd(addr, value);
+#endif
+}
+
+#endif

From 3fc1dca59ff2ba003fbd791a70702cd7a7e54db5 Mon Sep 17 00:00:00 2001
From: Tao Sang <tao.sang@amd.com>
Date: Mon, 9 May 2022 17:42:24 -0400
Subject: [PATCH 045/177] SWDEV-319818 - Release restriction on
 hipResourceViewDesc

Let more types support hipResourceViewDesc.
Add missing symbols to fix the CUDA build failure.

Change-Id: Ife694cc6491427093863252e257e820b0bb4fa0f
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 37 +++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index ebd5f501a7..b88548630a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -281,6 +281,43 @@ typedef enum cudaMemRangeAttribute hipMemRangeAttribute;
 //
 // hipErrorNoDevice.
 
+// hipResourceViewFormat
+typedef enum cudaResourceViewFormat hipResourceViewFormat;
+#define hipResViewFormatNone cudaResViewFormatNone
+#define hipResViewFormatUnsignedChar1 cudaResViewFormatUnsignedChar1
+#define hipResViewFormatUnsignedChar2 cudaResViewFormatUnsignedChar2
+#define hipResViewFormatUnsignedChar4 cudaResViewFormatUnsignedChar4
+#define hipResViewFormatSignedChar1 cudaResViewFormatSignedChar1
+#define hipResViewFormatSignedChar2 cudaResViewFormatSignedChar2
+#define hipResViewFormatSignedChar4 cudaResViewFormatSignedChar4
+#define hipResViewFormatUnsignedShort1 cudaResViewFormatUnsignedShort1
+#define hipResViewFormatUnsignedShort2 cudaResViewFormatUnsignedShort2
+#define hipResViewFormatUnsignedShort4 cudaResViewFormatUnsignedShort4
+#define hipResViewFormatSignedShort1 cudaResViewFormatSignedShort1
+#define hipResViewFormatSignedShort2 cudaResViewFormatSignedShort2
+#define hipResViewFormatSignedShort4 cudaResViewFormatSignedShort4
+#define hipResViewFormatUnsignedInt1 cudaResViewFormatUnsignedInt1
+#define hipResViewFormatUnsignedInt2 cudaResViewFormatUnsignedInt2
+#define hipResViewFormatUnsignedInt4 cudaResViewFormatUnsignedInt4
+#define hipResViewFormatSignedInt1 cudaResViewFormatSignedInt1
+#define hipResViewFormatSignedInt2 cudaResViewFormatSignedInt2
+#define hipResViewFormatSignedInt4 cudaResViewFormatSignedInt4
+#define hipResViewFormatHalf1 cudaResViewFormatHalf1
+#define hipResViewFormatHalf2 cudaResViewFormatHalf2
+#define hipResViewFormatHalf4 cudaResViewFormatHalf4
+#define hipResViewFormatFloat1 cudaResViewFormatFloat1
+#define hipResViewFormatFloat2 cudaResViewFormatFloat2
+#define hipResViewFormatFloat4 cudaResViewFormatFloat4
+#define hipResViewFormatUnsignedBlockCompressed1 cudaResViewFormatUnsignedBlockCompressed1
+#define hipResViewFormatUnsignedBlockCompressed2 cudaResViewFormatUnsignedBlockCompressed2
+#define hipResViewFormatUnsignedBlockCompressed3 cudaResViewFormatUnsignedBlockCompressed3
+#define hipResViewFormatUnsignedBlockCompressed4 cudaResViewFormatUnsignedBlockCompressed4
+#define hipResViewFormatSignedBlockCompressed4 cudaResViewFormatSignedBlockCompressed4
+#define hipResViewFormatUnsignedBlockCompressed5 cudaResViewFormatUnsignedBlockCompressed5
+#define hipResViewFormatSignedBlockCompressed5 cudaResViewFormatSignedBlockCompressed5
+#define hipResViewFormatUnsignedBlockCompressed6H cudaResViewFormatUnsignedBlockCompressed6H
+#define hipResViewFormatSignedBlockCompressed6H cudaResViewFormatSignedBlockCompressed6H
+#define hipResViewFormatUnsignedBlockCompressed7 cudaResViewFormatUnsignedBlockCompressed7
 
 //! Flags that can be used with hipEventCreateWithFlags:
 #define hipEventDefault cudaEventDefault

From f874762659a981f90c181cc2e373423376f897cc Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 10 Jan 2022 11:02:49 -0500
Subject: [PATCH 046/177] SWDEV-318833 - Get and Set attribute for kernel nodes

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: Ie51aa4b56661cbb8c5b4eb1dbaad327377084ffb
Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index b88548630a..20d73aec6c 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1131,6 +1131,10 @@ typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
 #define hipStreamCaptureStatusActive cudaStreamCaptureStatusActive
 #define hipStreamCaptureStatusInvalidated cudaStreamCaptureStatusInvalidated
 
+typedef union cudaKernelNodeAttrValue hipKernelNodeAttrValue;
+typedef enum  cudaKernelNodeAttrID hipKernelNodeAttrID;
+
+
 #if CUDA_VERSION >= CUDA_11030
 typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDependenciesFlags;
 #define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies
@@ -2677,6 +2681,16 @@ inline static hipError_t hipGraphKernelNodeSetParams(hipGraphNode_t node,
     return hipCUDAErrorTohipError(cudaGraphKernelNodeSetParams(node, pNodeParams));
 }
 
+inline static hipError_t hipGraphKernelNodeSetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
+                                                        const hipKernelNodeAttrValue* value) {
+    return hipCUDAErrorTohipError(cudaGraphKernelNodeSetAttribute(hNode, attr, value));
+}
+
+inline static hipError_t hipGraphKernelNodeGetAttribute(hipGraphNode_t hNode, hipKernelNodeAttrID attr,
+                                                        hipKernelNodeAttrValue* value) {
+    return hipCUDAErrorTohipError(cudaGraphKernelNodeGetAttribute(hNode, attr, value));
+}
+
 inline static hipError_t hipGraphMemcpyNodeGetParams(hipGraphNode_t node,
                                                      hipMemcpy3DParms* pNodeParams) {
     return hipCUDAErrorTohipError(cudaGraphMemcpyNodeGetParams(node, pNodeParams));

From 9583567388c144ec6be6096e6b6043abb047cb3a Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Fri, 29 Apr 2022 16:14:29 -0400
Subject: [PATCH 047/177] SWDEV-321698 - remove Memcpy1D type

- The Memcpy1D node type does not comply with CUDA

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: If8113f5e699de0c62d98effc4580a2e0fee9a950
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 20d73aec6c..b9e095a676 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1100,7 +1100,6 @@ typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeEmpty cudaGraphNodeTypeEmpty
 #define hipGraphNodeTypeWaitEvent cudaGraphNodeTypeWaitEvent
 #define hipGraphNodeTypeEventRecord cudaGraphNodeTypeEventRecord
-#define hipGraphNodeTypeMemcpy1D cudaGraphNodeTypeMemcpy1D
 #define hipGraphNodeTypeMemcpyFromSymbol cudaGraphNodeTypeMemcpyFromSymbol
 #define hipGraphNodeTypeMemcpyToSymbol cudaGraphNodeTypeMemcpyToSymbol
 #define hipGraphNodeTypeCount cudaGraphNodeTypeCount

From 9e3c65ccbd259fabfad45005f554e310f782d90b Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 4 Apr 2022 16:22:16 -0400
Subject: [PATCH 048/177] SWDEV-325711 - add Graph memory APIs skeleton

- hipDeviceGet/SetGraphMemAttr

- hipDeviceGraphMemTrim

- there is no memory pool for graphs currently

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I11db76ea7ea1c7732175fc93264448052357e8dc
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h      | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index b9e095a676..008d734b50 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1133,6 +1133,7 @@ typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
 typedef union cudaKernelNodeAttrValue hipKernelNodeAttrValue;
 typedef enum  cudaKernelNodeAttrID hipKernelNodeAttrID;
 
+typedef enum  cudaGraphMemAttributeType hipGraphMemAttributeType;
 
 #if CUDA_VERSION >= CUDA_11030
 typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDependenciesFlags;
@@ -3018,6 +3019,18 @@ inline static hipError_t hipGraphExecEventWaitNodeSetEvent(hipGraphExec_t hGraph
                                                            hipGraphNode_t hNode, hipEvent_t event) {
     return hipCUDAErrorTohipError(cudaGraphExecEventWaitNodeSetEvent(hGraphExec, hNode, event));
 }
+
+inline static hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) {
+    return hipCUDAErrorTohipError(cudaDeviceGetGraphMemAttribute(device, attr, value));
+}
+
+inline static hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) {
+    return hipCUDAErrorTohipError(cudaDeviceSetGraphMemAttribute(device, attr, value));
+}
+
+inline static hipError_t hipDeviceGraphMemTrim(int device) {
+    return hipCUDAErrorTohipError(cudaDeviceGraphMemTrim(device));
+}
 #endif
 
 inline static hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node,

From cb53be08fccdfd72d2633559e192f9d40a7a965e Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Thu, 26 May 2022 11:11:01 +0530
Subject: [PATCH 049/177] SWDEV-338376 - Added hipMemGetAllocationGranularity,
 hipMemCreate and hipMemRelease for CUDA

Change-Id: If8c6a00958bb39752eb202c30a7960d13de225ae
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 008d734b50..8ff26c7e88 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1140,7 +1140,26 @@ typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDepe
 #define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies
 #define hipStreamSetCaptureDependencies cudaStreamSetCaptureDependencies
 #endif
-
+#if CUDA_VERSION >= CUDA_10020
+typedef struct CUmemAllocationProp_st hipMemAllocationProp;
+#define hipMemAllocationGranularityMinimum CU_MEM_ALLOC_GRANULARITY_MINIMUM
+#define hipMemAllocationGranularityRecommended CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
+typedef enum CUmemAllocationGranularity_flags_enum  hipMemAllocationGranularity_flags;
+//typedef struct CUmemLocation_st hipMemLocation;
+typedef enum CUmemLocationType_enum hipMemLocationType;
+#define hipMemLocationTypeInvalid CU_MEM_LOCATION_TYPE_INVALID
+#define hipMemLocationTypeDevice CU_MEM_LOCATION_TYPE_DEVICE
+//typedef enum CUmemAllocationHandleType_enum  hipMemAllocationHandleType;
+#define hipMemHandleTypeNone CU_MEM_HANDLE_TYPE_NONE
+#define hipMemHandleTypePosixFileDescriptor CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
+#define hipMemHandleTypeWin32 CU_MEM_HANDLE_TYPE_WIN32
+#define hipMemHandleTypeWin32Kmt CU_MEM_HANDLE_TYPE_WIN32_KMT
+typedef enum CUmemAllocationType_enum  hipMemAllocationType;
+#define hipMemAllocationTypeInvalid CU_MEM_ALLOCATION_TYPE_INVALID
+#define hipMemAllocationTypePinned CU_MEM_ALLOCATION_TYPE_PINNED
+#define hipMemAllocationTypeMax CU_MEM_ALLOCATION_TYPE_MAX
+#define hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle
+#endif
 /**
  * Stream CallBack struct
  */
@@ -1837,14 +1856,32 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
             cdattr = cudaDevAttrMemoryPoolsSupported;
             break;
 #endif // CUDA_VERSION >= CUDA_11020
+        case hipDeviceAttributeVirtualMemoryManagementSupported:
+            return hipCUResultTohipError(cuDeviceGetAttribute(pi,
+                                                              CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
+                                                              device));
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }
-
     cerror = cudaDeviceGetAttribute(pi, cdattr, device);
-
     return hipCUDAErrorTohipError(cerror);
 }
+#if CUDA_VERSION >= CUDA_10020
+inline static hipError_t hipMemGetAllocationGranularity(size_t* granularity,
+                                                        const hipMemAllocationProp* prop,
+                                                        hipMemAllocationGranularity_flags option) {
+    return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, prop, option));
+}
+inline static hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle,
+                                      size_t size,
+                                      const hipMemAllocationProp* prop,
+                                      unsigned long long flags) {
+    return hipCUResultTohipError(cuMemCreate(handle, size, prop, flags));
+}
+inline static hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) {
+    return hipCUResultTohipError(cuMemRelease(handle));
+}
+#endif // CUDA_VERSION >= CUDA_10020
 
 inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
                                                                       const void* func,

From 70c90e597592188094725c77eceafd3d40d9ba4f Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 12 May 2022 12:40:23 +0000
Subject: [PATCH 050/177] SWDEV-336532 - Convert Managed Memory in
 hipPointerGetAttributes on Nvidia platform

Change-Id: I3bc6180040b6ad48f06ffdc7d01ca6fd9fe32c0e
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 8ff26c7e88..372a873e4d 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1949,6 +1949,9 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
             case cudaMemoryTypeHost:
                 attributes->memoryType = hipMemoryTypeHost;
                 break;
+            case cudaMemoryTypeManaged:
+                attributes->memoryType = hipMemoryTypeManaged;
+                break;
             default:
                 return hipErrorInvalidValue;
         }

From 53190b0a657c0499807ad71766a6bed654846e78 Mon Sep 17 00:00:00 2001
From: jaypatel <jaydeepkumar.patel@amd.com>
Date: Fri, 29 Apr 2022 08:22:01 +0000
Subject: [PATCH 051/177] SWDEV-322688 - Added hip math constants header.

Change-Id: I9363f82159a53c6822e943c4950fc7f6c350b326
---
 .../nvidia_detail/nvidia_hip_math_constants.h | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h
new file mode 100644
index 0000000000..7650bb0dec
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h
@@ -0,0 +1,62 @@
+/*
+Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef NVIDIA_HIP_MATH_CONSTANTS_H
+#define NVIDIA_HIP_MATH_CONSTANTS_H
+#include <math_constants.h>
+#define HIP_INF_F            CUDART_INF_F
+#define HIP_NAN_F            CUDART_NAN_F
+#define HIP_MIN_DENORM_F     CUDART_MIN_DENORM_F
+#define HIP_MAX_NORMAL_F     CUDART_MAX_NORMAL_F
+#define HIP_NEG_ZERO_F       CUDART_NEG_ZERO_F
+#define HIP_ZERO_F           CUDART_ZERO_F
+#define HIP_ONE_F            CUDART_ONE_F
+#define HIP_SQRT_HALF_F      CUDART_SQRT_HALF_F
+#define HIP_SQRT_HALF_HI_F   CUDART_SQRT_HALF_HI_F
+#define HIP_SQRT_HALF_LO_F   CUDART_SQRT_HALF_LO_F
+#define HIP_SQRT_TWO_F       CUDART_SQRT_TWO_F
+#define HIP_THIRD_F          CUDART_THIRD_F
+#define HIP_PIO4_F           CUDART_PIO4_F
+#define HIP_PIO2_F           CUDART_PIO2_F
+#define HIP_3PIO4_F          CUDART_3PIO4_F
+#define HIP_2_OVER_PI_F      CUDART_2_OVER_PI_F
+#define HIP_SQRT_2_OVER_PI_F CUDART_SQRT_2_OVER_PI_F
+#define HIP_PI_F             CUDART_PI_F
+#define HIP_L2E_F            CUDART_L2E_F
+#define HIP_L2T_F            CUDART_L2T_F
+#define HIP_LG2_F            CUDART_LG2_F
+#define HIP_LGE_F            CUDART_LGE_F
+#define HIP_LN2_F            CUDART_LN2_F
+#define HIP_LNT_F            CUDART_LNT_F
+#define HIP_LNPI_F           CUDART_LNPI_F
+#define HIP_TWO_TO_M126_F    CUDART_TWO_TO_M126_F
+#define HIP_TWO_TO_126_F     CUDART_TWO_TO_126_F
+#define HIP_NORM_HUGE_F      CUDART_NORM_HUGE_F
+#define HIP_TWO_TO_23_F      CUDART_TWO_TO_23_F
+#define HIP_TWO_TO_24_F      CUDART_TWO_TO_24_F
+#define HIP_TWO_TO_31_F      CUDART_TWO_TO_31_F
+#define HIP_TWO_TO_32_F      CUDART_TWO_TO_32_F
+#define HIP_REMQUO_BITS_F    CUDART_REMQUO_BITS_F
+#define HIP_REMQUO_MASK_F    CUDART_REMQUO_MASK_F
+#define HIP_TRIG_PLOSS_F     CUDART_TRIG_PLOSS_F
+#endif
+
+

From 95e6e11bf023c72156c922f25089071b3040b752 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Tue, 14 Jun 2022 04:29:41 +0000
Subject: [PATCH 052/177] SWDEV-340918 - make hip_runtime.h work with g++

Change-Id: Ief3e6dc075837dfccae3defad7da696130ed05bd
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
index b1002b71dd..19be62c6f6 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -26,7 +26,6 @@ THE SOFTWARE.
 #include <cuda_runtime.h>
 
 #include <hip/hip_runtime_api.h>
-#include "nvidia_hip_unsafe_atomics.h"
 
 #define HIP_KERNEL_NAME(...) __VA_ARGS__
 
@@ -77,6 +76,7 @@ typedef int hipLaunchParm;
 
 #ifdef __CUDACC__
 
+#include "nvidia_hip_unsafe_atomics.h"
 
 #define hipThreadIdx_x threadIdx.x
 #define hipThreadIdx_y threadIdx.y

From 328995043639c53b00e782b2076a8763d783f59d Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Thu, 23 Jun 2022 14:06:27 +0000
Subject: [PATCH 053/177] SWDEV-341174 - fixed warning on cuda

Change-Id: I5d017bcc154a2985e23893c0d933438bf0fc1958
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 372a873e4d..3ed8cf17ab 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2264,7 +2264,7 @@ inline static hipError_t hipModuleGetFunction(hipFunction_t* function, hipModule
 }
 
 inline static hipError_t hipModuleGetTexRef(hipTexRef* pTexRef, hipModule_t hmod, const char* name){
-    hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));
+    return hipCUResultTohipError(cuModuleGetTexRef(pTexRef, hmod, name));
 }
 
 inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const void* func) {

From 36e047814b18381aae8728da1a17e47fdf60e0e2 Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Mon, 11 Jul 2022 13:05:59 +0000
Subject: [PATCH 054/177] SWDEV-345723 - Fixed test build issue

Change-Id: Id9545c9523bfad6b5afefcdc1f705d6043094ae4
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 3ed8cf17ab..a85877b5ae 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -349,6 +349,7 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define HIP_LAUNCH_PARAM_END CU_LAUNCH_PARAM_END
 #define hipLimitPrintfFifoSize cudaLimitPrintfFifoSize
 #define hipLimitMallocHeapSize cudaLimitMallocHeapSize
+#define hipLimitStackSize      cudaLimitStackSize
 #define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
 
 #define hipOccupancyDefault cudaOccupancyDefault

From 038114c0989269108b1a6cf54c03aea24eff2124 Mon Sep 17 00:00:00 2001
From: Sarbojit Sarkar <sarbojit.sarkar@amd.com>
Date: Mon, 11 Jul 2022 07:48:30 +0000
Subject: [PATCH 055/177] SWDEV-341174 - Fixed typo

Change-Id: Ib605390bd17d014539d57488f0633db44d7c1fad
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a85877b5ae..f31d616bfb 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -33,7 +33,7 @@ THE SOFTWARE.
 #define CUDA_10010 10010
 #define CUDA_10020 10020
 #define CUDA_11010 11010
-#define CUDA_11010 11020
+#define CUDA_11020 11020
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 

From e7f7073e96088a827e1eea24fd8e9fcd5d10884b Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 2 May 2022 12:25:30 -0400
Subject: [PATCH 056/177] SWDEV-321698 - add new enum entries to match cuda

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: If0a17b3234fd90b1f553a075e02280258000d36a
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f31d616bfb..d6a8bc05cb 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1101,6 +1101,8 @@ typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeEmpty cudaGraphNodeTypeEmpty
 #define hipGraphNodeTypeWaitEvent cudaGraphNodeTypeWaitEvent
 #define hipGraphNodeTypeEventRecord cudaGraphNodeTypeEventRecord
+#define hipGraphNodeTypeExtSemaphoreSignal cudaGraphNodeTypeExtSemaphoreSignal
+#define hipGraphNodeTypeExtSemaphoreWait  cudaGraphNodeTypeExtSemaphoreWait
 #define hipGraphNodeTypeMemcpyFromSymbol cudaGraphNodeTypeMemcpyFromSymbol
 #define hipGraphNodeTypeMemcpyToSymbol cudaGraphNodeTypeMemcpyToSymbol
 #define hipGraphNodeTypeCount cudaGraphNodeTypeCount

From 13d1a1b0e4eecbe35e5282b614b320a05dc314ae Mon Sep 17 00:00:00 2001
From: Brian Sumner <brian.sumner@amd.com>
Date: Mon, 11 Jul 2022 13:57:28 -0700
Subject: [PATCH 057/177] SWDEV-333033 - add safe and unsafe atomic min and max
 including gfx940 and add missing nvidia support

Change-Id: I829a0a5fd49c510e77eabbcb92d1a415ef6b5a4c
---
 .../hip/nvidia_detail/nvidia_hip_atomics.h    | 75 +++++++++++++++++++
 .../hip/nvidia_detail/nvidia_hip_runtime.h    |  1 +
 .../nvidia_detail/nvidia_hip_unsafe_atomics.h | 32 ++++++++
 3 files changed, 108 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h
new file mode 100644
index 0000000000..f9a92d582a
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_ATOMICS_H
+
+
+__device__ inline float atomicMax(float* addr, float val) {
+    unsigned int *uaddr = (unsigned int *)addr;
+    float value = __uint_as_float(*uaddr);
+
+    while (value < val) {
+        value = __uint_as_float(atomicCAS(uaddr, __float_as_uint(value),
+                                                 __float_as_uint(val)));
+    }
+    return value;
+}
+
+__device__ inline double atomicMax(double* addr, double val) {
+    unsigned long long* uaddr  = (unsigned long long *)addr;
+    double value = __longlong_as_double(*uaddr);
+
+    while (value < val) {
+        value = __longlong_as_double(atomicCAS(uaddr,
+                                        __double_as_longlong(value),
+                                        __double_as_longlong(val)));
+    }
+
+    return value;
+}
+
+__device__ inline float atomicMin(float* addr, float val) {
+    unsigned int *uaddr = (unsigned int *)addr;
+    float value = __uint_as_float(*uaddr);
+
+    while (value > val) {
+        value = __uint_as_float(atomicCAS(uaddr, __float_as_uint(value),
+                                                 __float_as_uint(val)));
+    }
+    return value;
+}
+
+__device__ inline double atomicMin(double* addr, double val) {
+    unsigned long long* uaddr  = (unsigned long long *)addr;
+    double value = __longlong_as_double(*uaddr);
+
+    while (value > val) {
+        value = __longlong_as_double(atomicCAS(uaddr,
+                                         __double_as_longlong(value),
+                                         __double_as_longlong(val)));
+    }
+
+    return value;
+}
+
+#endif
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
index 19be62c6f6..c63e35700b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -76,6 +76,7 @@ typedef int hipLaunchParm;
 
 #ifdef __CUDACC__
 
+#include "nvidia_hip_atomics.h"
 #include "nvidia_hip_unsafe_atomics.h"
 
 #define hipThreadIdx_x threadIdx.x
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
index 919353129a..993f17507b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_unsafe_atomics.h
@@ -44,6 +44,22 @@ __device__ inline double unsafeAtomicAdd(double* addr, double value) {
 #endif
 }
 
+__device__ inline float unsafeAtomicMax(float* addr, float value) {
+    return atomicMax(addr, value);
+}
+
+__device__ inline double unsafeAtomicMax(double* addr, double val) {
+    return atomicMax(addr, val);
+}
+
+__device__ inline float unsafeAtomicMin(float* addr, float value) {
+    return atomicMin(addr, value);
+}
+
+__device__ inline double unsafeAtomicMin(double* addr, double val) {
+    return atomicMin(addr, val);
+}
+
 __device__ inline float safeAtomicAdd(float* addr, float value) {
     return atomicAdd(addr, value);
 }
@@ -65,4 +81,20 @@ __device__ inline double safeAtomicAdd(double* addr, double value) {
 #endif
 }
 
+__device__ inline float safeAtomicMax(float* addr, float value) {
+    return atomicMax(addr, value);
+}
+
+__device__ inline double safeAtomicMax(double* addr, double val) {
+    return atomicMax(addr, val);
+}
+
+__device__ inline float safeAtomicMin(float* addr, float value) {
+    return atomicMin(addr, value);
+}
+
+__device__ inline double safeAtomicMin(double* addr, double val) {
+    return atomicMin(addr, val);
+}
+
 #endif

From b875936c5d0b31545b8afdb7f780f1c80e5565cf Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 18 Jul 2022 14:51:24 -0400
Subject: [PATCH 058/177] SWDEV-346448 - fix the missing defines

for nvidia compile

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: Ifd0e50a6f8636609fa61680cc719a2f82fb1320c
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d6a8bc05cb..a0e05cf2dd 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1135,6 +1135,13 @@ typedef enum cudaStreamCaptureStatus hipStreamCaptureStatus;
 
 typedef union cudaKernelNodeAttrValue hipKernelNodeAttrValue;
 typedef enum  cudaKernelNodeAttrID hipKernelNodeAttrID;
+#define hipKernelNodeAttributeAccessPolicyWindow cudaKernelNodeAttributeAccessPolicyWindow
+#define hipKernelNodeAttributeCooperative cudaKernelNodeAttributeCooperative
+typedef enum cudaAccessProperty hipAccessProperty;
+#define hipAccessPropertyNormal cudaAccessPropertyNormal
+#define hipAccessPropertyStreaming cudaAccessPropertyStreaming
+#define hipAccessPropertyPersisting cudaAccessPropertyPersisting
+typedef struct cudaAccessPolicyWindow hipAccessPolicyWindow;
 
 typedef enum  cudaGraphMemAttributeType hipGraphMemAttributeType;
 

From a44d54f279cf7c061b2658dfd0565b8d7fbd19ea Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Mon, 4 Jul 2022 21:06:03 +0530
Subject: [PATCH 059/177] SWDEV-344135 - Enable stream ordered memory and
 mempool tests

- Implement CUDA mappings for stream ordered memory allocator and memory pool APIs

Change-Id: I2434118ff043527ec7c3389cd5175e1e21d032bf
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 97 ++++++++++++++++---
 1 file changed, 81 insertions(+), 16 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a0e05cf2dd..3ae9bc8150 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -449,10 +449,21 @@ typedef struct cudaPos hipPos;
 
 typedef cudaMemPool_t hipMemPool_t;
 typedef enum cudaMemPoolAttr hipMemPoolAttr;
+#define hipMemPoolReuseFollowEventDependencies cudaMemPoolReuseFollowEventDependencies
+#define hipMemPoolReuseAllowOpportunistic cudaMemPoolReuseAllowOpportunistic
+#define hipMemPoolReuseAllowInternalDependencies cudaMemPoolReuseAllowInternalDependencies
+#define hipMemPoolAttrReleaseThreshold cudaMemPoolAttrReleaseThreshold
+#define hipMemPoolAttrReservedMemCurrent cudaMemPoolAttrReservedMemCurrent
+#define hipMemPoolAttrReservedMemHigh cudaMemPoolAttrReservedMemHigh
+#define hipMemPoolAttrUsedMemCurrent cudaMemPoolAttrUsedMemCurrent
+#define hipMemPoolAttrUsedMemHigh cudaMemPoolAttrUsedMemHigh
 typedef struct cudaMemLocation hipMemLocation;
 typedef struct cudaMemPoolProps hipMemPoolProps;
 typedef struct cudaMemAccessDesc hipMemAccessDesc;
 typedef enum cudaMemAccessFlags hipMemAccessFlags;
+#define hipMemAccessFlagsProtNone cudaMemAccessFlagsProtNone
+#define hipMemAccessFlagsProtRead cudaMemAccessFlagsProtRead
+#define hipMemAccessFlagsProtReadWrite cudaMemAccessFlagsProtReadWrite
 typedef enum cudaMemAllocationHandleType hipMemAllocationHandleType;
 typedef struct cudaMemPoolPtrExportData hipMemPoolPtrExportData;
 
@@ -1151,24 +1162,59 @@ typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDepe
 #define hipStreamSetCaptureDependencies cudaStreamSetCaptureDependencies
 #endif
 #if CUDA_VERSION >= CUDA_10020
-typedef struct CUmemAllocationProp_st hipMemAllocationProp;
 #define hipMemAllocationGranularityMinimum CU_MEM_ALLOC_GRANULARITY_MINIMUM
 #define hipMemAllocationGranularityRecommended CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
 typedef enum CUmemAllocationGranularity_flags_enum  hipMemAllocationGranularity_flags;
-//typedef struct CUmemLocation_st hipMemLocation;
-typedef enum CUmemLocationType_enum hipMemLocationType;
-#define hipMemLocationTypeInvalid CU_MEM_LOCATION_TYPE_INVALID
-#define hipMemLocationTypeDevice CU_MEM_LOCATION_TYPE_DEVICE
-//typedef enum CUmemAllocationHandleType_enum  hipMemAllocationHandleType;
-#define hipMemHandleTypeNone CU_MEM_HANDLE_TYPE_NONE
-#define hipMemHandleTypePosixFileDescriptor CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR
-#define hipMemHandleTypeWin32 CU_MEM_HANDLE_TYPE_WIN32
-#define hipMemHandleTypeWin32Kmt CU_MEM_HANDLE_TYPE_WIN32_KMT
-typedef enum CUmemAllocationType_enum  hipMemAllocationType;
-#define hipMemAllocationTypeInvalid CU_MEM_ALLOCATION_TYPE_INVALID
-#define hipMemAllocationTypePinned CU_MEM_ALLOCATION_TYPE_PINNED
-#define hipMemAllocationTypeMax CU_MEM_ALLOCATION_TYPE_MAX
+typedef enum cudaMemLocationType hipMemLocationType;
+#define hipMemLocationTypeInvalid cudaMemLocationTypeInvalid
+#define hipMemLocationTypeDevice cudaMemLocationTypeDevice
+#define hipMemHandleTypeNone cudaMemHandleTypeNone
+#define hipMemHandleTypePosixFileDescriptor cudaMemHandleTypePosixFileDescriptor
+#define hipMemHandleTypeWin32 cudaMemHandleTypeWin32
+#define hipMemHandleTypeWin32Kmt cudaMemHandleTypeWin32Kmt
+typedef enum cudaMemAllocationType hipMemAllocationType;
+#define hipMemAllocationTypeInvalid cudaMemAllocationTypeInvalid
+#define hipMemAllocationTypePinned cudaMemAllocationTypePinned
+#define hipMemAllocationTypeMax cudaMemAllocationTypeMax
 #define hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle
+// Explicitely declaring hipMemAllocationProp based on CUmemAllocationProp but using CUDA runtime members instead
+// Because hipMemAllocationType, hipMemAllocationHandleType & hipMemLocation are defined using CUDA runtime data types & also used by hipMemPoolProps
+// Currently there doesn't exist CUDA inbuilt runtime structure corresponding to CUmemAllocationProp
+// Need to update this structure accordingly if CUDA updates CUmemAllocationProp
+typedef struct hipMemAllocationProp {
+    /** Memory allocation type */
+    hipMemAllocationType type;
+    /** Requested handle type */
+    hipMemAllocationHandleType requestedHandleTypes;
+    /** Location of allocation */
+    hipMemLocation location;
+    /**
+     * Windows-specific POBJECT_ATTRIBUTES required when
+     * ::CU_MEM_HANDLE_TYPE_WIN32 is specified.  This object atributes structure
+     * includes security attributes that define
+     * the scope of which exported allocations may be tranferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void *win32HandleMetaData;
+    struct {
+         /**
+         * Allocation hint for requesting compressible memory.
+         * On devices that support Compute Data Compression, compressible
+         * memory can be used to accelerate accesses to data with unstructured
+         * sparsity and other compressible data patterns. Applications are
+         * expected to query allocation property of the handle obtained with
+         * ::cuMemCreate using ::cuMemGetAllocationPropertiesFromHandle to
+         * validate if the obtained allocation is compressible or not. Note that
+         * compressed memory may not be mappable on all devices.
+         */
+         unsigned char compressionType;
+         /** RDMA capable */
+         unsigned char gpuDirectRDMACapable;
+         /** Bitmask indicating intended usage for this allocation */
+         unsigned short usage;
+         unsigned char reserved[4];
+    } allocFlags;
+} hipMemAllocationProp;
 #endif
 /**
  * Stream CallBack struct
@@ -1877,16 +1923,35 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
     return hipCUDAErrorTohipError(cerror);
 }
 #if CUDA_VERSION >= CUDA_10020
+inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(const hipMemAllocationProp* prop) {
+    CUmemAllocationProp cuProp;
+    cuProp.type = (CUmemAllocationType)prop->type;
+    cuProp.requestedHandleTypes = (CUmemAllocationHandleType)prop->requestedHandleTypes;
+    cuProp.location.type = (CUmemLocationType)prop->location.type;
+    cuProp.location.id = prop->location.id;
+    cuProp.win32HandleMetaData = prop->win32HandleMetaData;
+    cuProp.allocFlags.compressionType = prop->allocFlags.compressionType;
+    cuProp.allocFlags.gpuDirectRDMACapable = prop->allocFlags.gpuDirectRDMACapable;
+    cuProp.allocFlags.usage = prop->allocFlags.usage;
+    cuProp.allocFlags.reserved[0] = prop->allocFlags.reserved[0];
+    cuProp.allocFlags.reserved[1] = prop->allocFlags.reserved[1];
+    cuProp.allocFlags.reserved[2] = prop->allocFlags.reserved[2];
+    cuProp.allocFlags.reserved[3] = prop->allocFlags.reserved[3];
+    return cuProp;
+}
+
 inline static hipError_t hipMemGetAllocationGranularity(size_t* granularity,
                                                         const hipMemAllocationProp* prop,
                                                         hipMemAllocationGranularity_flags option) {
-    return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, prop, option));
+    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
+    return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, &cuProp, option));
 }
 inline static hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle,
                                       size_t size,
                                       const hipMemAllocationProp* prop,
                                       unsigned long long flags) {
-    return hipCUResultTohipError(cuMemCreate(handle, size, prop, flags));
+    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
+    return hipCUResultTohipError(cuMemCreate(handle, size, &cuProp, flags));
 }
 inline static hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) {
     return hipCUResultTohipError(cuMemRelease(handle));

From bd1ffabfc1aaa9f50507599d49ef9bc3fb26fa39 Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Wed, 20 Jul 2022 13:33:03 -0400
Subject: [PATCH 060/177] SWDEV-347345 - add missing func for nvidia

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: Ie2a70778d01948083d724a9d20e1f17541178ec5
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 3ae9bc8150..d76e9812a2 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2826,6 +2826,10 @@ inline static hipError_t hipGraphMemsetNodeSetParams(hipGraphNode_t node,
     return hipCUDAErrorTohipError(cudaGraphMemsetNodeSetParams(node, pNodeParams));
 }
 
+inline static hipError_t hipThreadExchangeStreamCaptureMode(hipStreamCaptureMode* mode) {
+    return hipCUDAErrorTohipError(cudaThreadExchangeStreamCaptureMode(mode));
+}
+
 inline static hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec,
                                                          hipGraphNode_t node,
                                                          const hipKernelNodeParams* pNodeParams) {

From 013fa239881adc5349d25d25eb6ef1e8721f7a4c Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Fri, 22 Jul 2022 11:00:56 +0000
Subject: [PATCH 061/177] SWDEV-338376 - CUDA VMM API mappings

Change-Id: I47595dbf57fcce352d23842dbbc2b98b4ec97fb5
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 79 ++++++++++++++++++-
 1 file changed, 78 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d76e9812a2..16a3a97101 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1177,6 +1177,16 @@ typedef enum cudaMemAllocationType hipMemAllocationType;
 #define hipMemAllocationTypePinned cudaMemAllocationTypePinned
 #define hipMemAllocationTypeMax cudaMemAllocationTypeMax
 #define hipMemGenericAllocationHandle_t CUmemGenericAllocationHandle
+//CUarrayMapInfo mappings
+typedef CUarrayMapInfo hipArrayMapInfo;
+typedef CUarraySparseSubresourceType hipArraySparseSubresourceType;
+#define hipArraySparseSubresourceTypeSparseLevel CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_SPARSE_LEVEL
+#define hipArraySparseSubresourceTypeMiptail CU_ARRAY_SPARSE_SUBRESOURCE_TYPE_MIPTAIL
+typedef CUmemOperationType hipMemOperationType;
+#define hipMemOperationTypeMap CU_MEM_OPERATION_TYPE_MAP
+#define hipMemOperationTypeUnmap CU_MEM_OPERATION_TYPE_UNMAP
+typedef CUmemHandleType hipMemHandleType;
+#define hipMemHandleTypeGeneric CU_MEM_HANDLE_TYPE_GENERIC
 // Explicitely declaring hipMemAllocationProp based on CUmemAllocationProp but using CUDA runtime members instead
 // Because hipMemAllocationType, hipMemAllocationHandleType & hipMemLocation are defined using CUDA runtime data types & also used by hipMemPoolProps
 // Currently there doesn't exist CUDA inbuilt runtime structure corresponding to CUmemAllocationProp
@@ -1939,7 +1949,19 @@ inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(cons
     cuProp.allocFlags.reserved[3] = prop->allocFlags.reserved[3];
     return cuProp;
 }
-
+inline static CUmemLocation hipMemLocationToCUmemLocation(const hipMemLocation* loc) {
+    CUmemLocation cuLoc;
+    cuLoc.id = loc->id;
+    cuLoc.type = (CUmemLocationType)loc->type;
+    return cuLoc;
+}
+inline static CUmemAccessDesc hipMemAccessDescToCUmemAccessDesc(const hipMemAccessDesc* desc) {
+    CUmemAccessDesc cuDesc;
+    cuDesc.flags = (CUmemAccess_flags)desc->flags;
+    cuDesc.location.id = (desc->location).id;
+    cuDesc.location.type = (CUmemLocationType)((desc->location).type);
+    return cuDesc;
+}
 inline static hipError_t hipMemGetAllocationGranularity(size_t* granularity,
                                                         const hipMemAllocationProp* prop,
                                                         hipMemAllocationGranularity_flags option) {
@@ -1956,6 +1978,61 @@ inline static hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle,
 inline static hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) {
     return hipCUResultTohipError(cuMemRelease(handle));
 }
+inline static hipError_t hipMemAddressFree(hipDeviceptr_t ptr, size_t size) {
+    return hipCUResultTohipError(cuMemAddressFree(ptr, size));
+}
+inline static hipError_t hipMemAddressReserve(hipDeviceptr_t* ptr,
+                                              size_t size,
+                                              size_t alignment,
+                                              hipDeviceptr_t addr,
+                                              unsigned long long flags) {
+    return hipCUResultTohipError(cuMemAddressReserve(ptr, size, alignment, addr, flags));
+}
+inline static hipError_t hipMemExportToShareableHandle(void* shareableHandle,
+                                                       hipMemGenericAllocationHandle_t handle,
+                                                       hipMemAllocationHandleType handleType,
+                                                       unsigned long long flags) {
+    return hipCUResultTohipError(cuMemExportToShareableHandle(shareableHandle, handle, (CUmemAllocationHandleType)handleType, flags));
+}
+inline static hipError_t hipMemGetAccess(unsigned long long* flags,
+                                         const hipMemLocation* location,
+                                         hipDeviceptr_t ptr) {
+    CUmemLocation loc = hipMemLocationToCUmemLocation(location);
+    return hipCUResultTohipError(cuMemGetAccess(flags, &loc, ptr));
+}
+inline static hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
+                                                                 hipMemGenericAllocationHandle_t handle) {
+    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
+    return hipCUResultTohipError(cuMemGetAllocationPropertiesFromHandle(&cuProp, handle));
+}
+inline static hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle,
+                                                         void* osHandle,
+                                                         hipMemAllocationHandleType shHandleType) {
+    return hipCUResultTohipError(cuMemImportFromShareableHandle(handle, osHandle, (CUmemAllocationHandleType)shHandleType));
+}
+inline static hipError_t hipMemMap(hipDeviceptr_t ptr, size_t size, size_t offset,
+                                   hipMemGenericAllocationHandle_t handle,
+                                   unsigned long long flags) {
+    return hipCUResultTohipError(cuMemMap(ptr, size, offset, handle, flags));
+}
+inline static hipError_t hipMemMapArrayAsync(hipArrayMapInfo* mapInfoList,
+                                             unsigned int  count,
+                                             hipStream_t stream) {
+    return hipCUResultTohipError(cuMemMapArrayAsync(mapInfoList, count, stream));
+}
+inline static hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHandle_t* handle,
+                                                      void* addr) {
+    return hipCUResultTohipError(cuMemRetainAllocationHandle(handle, addr));
+}
+inline static hipError_t hipMemSetAccess(hipDeviceptr_t ptr, size_t size,
+                                         const hipMemAccessDesc* desc,
+                                         size_t count) {
+    CUmemAccessDesc cuDesc = hipMemAccessDescToCUmemAccessDesc(desc);
+    return hipCUResultTohipError(cuMemSetAccess(ptr, size, &cuDesc, count));
+}
+inline static hipError_t hipMemUnmap(hipDeviceptr_t ptr, size_t size) {
+    return hipCUResultTohipError(cuMemUnmap(ptr, size));
+}
 #endif // CUDA_VERSION >= CUDA_10020
 
 inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,

From 5a485e33930ed8dac7394af4a55eab274f847b6a Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 30 Jun 2022 13:53:02 +0000
Subject: [PATCH 062/177] SWDEV-341992 - Add hipRefTex data types mapping on
 nvidia path

Change-Id: Icec2ec110f2647bfb4c571ff16dd3852cfc9fe4a
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 16a3a97101..6fdf4dc2d6 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -230,6 +230,11 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
 #define hiparray CUarray
 typedef CUmipmappedArray hipMipmappedArray_t;
 
+#define HIP_TRSA_OVERRIDE_FORMAT        CU_TRSA_OVERRIDE_FORMAT
+#define HIP_TRSF_READ_AS_INTEGER        CU_TRSF_READ_AS_INTEGER
+#define HIP_TRSF_NORMALIZED_COORDINATES CU_TRSF_NORMALIZED_COORDINATES
+#define HIP_TRSF_SRGB                   CU_TRSF_SRGB
+
 // hipTextureAddressMode
 typedef enum cudaTextureAddressMode hipTextureAddressMode;
 #define hipAddressModeWrap cudaAddressModeWrap

From d6cee2332a2d83870043d2d9178285d7dcd935ef Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 25 Apr 2022 13:42:17 -0400
Subject: [PATCH 063/177] SWDEV-325711: Add userobject functions for graph

- add user object APIs for creating, releasing, and retaining user objects

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I0bf2999c77e44269565b27c31c7c1461f8a160a2
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 6fdf4dc2d6..b6c0a93323 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1107,6 +1107,7 @@ inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormat
 typedef cudaGraph_t hipGraph_t;
 typedef cudaGraphNode_t hipGraphNode_t;
 typedef cudaGraphExec_t hipGraphExec_t;
+typedef cudaUserObject_t hipUserObject_t;
 
 typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeKernel cudaGraphNodeTypeKernel
@@ -3232,6 +3233,29 @@ inline static hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAt
 inline static hipError_t hipDeviceGraphMemTrim(int device) {
     return hipCUDAErrorTohipError(cudaDeviceGraphMemTrim(device));
 }
+
+inline static hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
+                                             unsigned int initialRefcount, unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaUserObjectCreate(object_out, ptr, destroy, initialRefcount, flags));
+}
+
+
+inline static hipError_t hipUserObjectRelease(hipUserObject_t object, unsigned int count __dparm(1)) {
+    return hipCUDAErrorTohipError(cudaUserObjectRelease(object, count));
+}
+
+
+inline static hipError_t hipUserObjectRetain(hipUserObject_t object, unsigned int count __dparm(1)) {
+    return hipCUDAErrorTohipError(cudaUserObjectRetain(object, count));  // adds `count` references; previously forwarded to cudaUserObjectRelease by mistake
+}
+
+inline static hipError_t hipGraphRetainUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count __dparm(1), unsigned int flags __dparm(0)) {
+    return hipCUDAErrorTohipError(cudaGraphRetainUserObject(graph, object, count, flags));
+}
+
+inline static hipError_t hipGraphReleaseUserObject(hipGraph_t graph, hipUserObject_t object, unsigned int count __dparm(1)) {
+    return hipCUDAErrorTohipError(cudaGraphReleaseUserObject(graph, object, count));
+}
 #endif
 
 inline static hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node,

From f7c0a424fb7f39f662f9cc69e86b679e2c11cf6a Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Thu, 25 Aug 2022 10:09:07 -0400
Subject: [PATCH 064/177] SWDEV-353672 - add missing defines for cuda

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I8e2e0a3b8845d1920e6a40e3ed676c657ab90276
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index b6c0a93323..d38c7438e8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1162,6 +1162,12 @@ typedef struct cudaAccessPolicyWindow hipAccessPolicyWindow;
 
 typedef enum  cudaGraphMemAttributeType hipGraphMemAttributeType;
 
+typedef enum cudaUserObjectFlags hipUserObjectFlags;
+#define hipUserObjectNoDestructorSync cudaUserObjectNoDestructorSync
+
+typedef enum cudaUserObjectRetainFlags hipUserObjectRetainFlags;
+#define hipGraphUserObjectMove cudaGraphUserObjectMove
+
 #if CUDA_VERSION >= CUDA_11030
 typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDependenciesFlags;
 #define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies

From 17c0b1734645b19e42c46975ad15f4eaf0fee4df Mon Sep 17 00:00:00 2001
From: kjayapra-amd <karthik.jayaprakash@amd.com>
Date: Thu, 25 Aug 2022 11:57:30 -0700
Subject: [PATCH 065/177] SWDEV-356651 - Merge hipJit options between hiprtc
 and hip runtime.

Change-Id: I73b66eaf081ce63b6b7ce45f71880877953cf81c
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d38c7438e8..884754d181 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -366,24 +366,24 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 
 
 // enum CUjit_option redefines
-#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
-#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
-#define hipJitOptionWallTime CU_JIT_WALL_TIME
-#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
-#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
-#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
-#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
-#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
-#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
-#define hipJitOptionTarget CU_JIT_TARGET
-#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
-#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
-#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
-#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
-#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
-#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
-#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
-#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
+#define HIPRTC_JIT_MAX_REGISTERS CU_JIT_MAX_REGISTERS
+#define HIPRTC_JIT_THREADS_PER_BLOCK CU_JIT_THREADS_PER_BLOCK
+#define HIPRTC_JIT_WALL_TIME CU_JIT_WALL_TIME
+#define HIPRTC_JIT_INFO_LOG_BUFFER CU_JIT_INFO_LOG_BUFFER
+#define HIPRTC_JIT_INFO_LOG_BUFFER_SIZE_BYTES CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define HIPRTC_JIT_ERROR_LOG_BUFFER CU_JIT_ERROR_LOG_BUFFER
+#define HIPRTC_JIT_ERROR_LOG_BUFFER_SIZE_BYTES CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define HIPRTC_JIT_OPTIMIZATION_LEVEL CU_JIT_OPTIMIZATION_LEVEL
+#define HIPRTC_JIT_TARGET_FROM_HIPCONTEXT CU_JIT_TARGET_FROM_CUCONTEXT
+#define HIPRTC_JIT_TARGET CU_JIT_TARGET
+#define HIPRTC_JIT_FALLBACK_STRATEGY CU_JIT_FALLBACK_STRATEGY
+#define HIPRTC_JIT_GENERATE_DEBUG_INFO CU_JIT_GENERATE_DEBUG_INFO
+#define HIPRTC_JIT_LOG_VERBOSE CU_JIT_LOG_VERBOSE
+#define HIPRTC_JIT_GENERATE_LINE_INFO CU_JIT_GENERATE_LINE_INFO
+#define HIPRTC_JIT_CACHE_MODE CU_JIT_CACHE_MODE
+#define HIPRTC_JIT_NEW_SM3X_OPT CU_JIT_NEW_SM3X_OPT
+#define HIPRTC_JIT_FAST_COMPILE CU_JIT_FAST_COMPILE
+#define HIPRTC_JIT_NUM_OPTIONS CU_JIT_NUM_OPTIONS
 
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;

From 9ef5e96734f24907fb68793ac8e3c9d4e56704bd Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Mon, 3 Oct 2022 16:06:09 +0500
Subject: [PATCH 066/177] SWDEV-360001 - Add hipLaunchHostFunc in Nvidia path

Change-Id: I5c733b416642174a089a86c420b03e8775e206d6
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 884754d181..0067f8bffa 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3240,6 +3240,10 @@ inline static hipError_t hipDeviceGraphMemTrim(int device) {
     return hipCUDAErrorTohipError(cudaDeviceGraphMemTrim(device));
 }
 
+inline static hipError_t hipLaunchHostFunc(hipStream_t stream, hipHostFn_t fn, void* userData) {
+    return hipCUDAErrorTohipError(cudaLaunchHostFunc(stream, fn, userData));
+}
+
 inline static hipError_t hipUserObjectCreate(hipUserObject_t* object_out, void* ptr, hipHostFn_t destroy,
                                              unsigned int initialRefcount, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaUserObjectCreate(object_out, ptr, destroy, initialRefcount, flags));

From 5f582f5615535ed5b50614a9933f3502868bbd3a Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Thu, 22 Sep 2022 09:24:16 +0000
Subject: [PATCH 067/177] SWDEV-356551 - Add support for device attributes.

Change-Id: Ic0f2b9cb510b027e0b8d1b3fb6d984e0d2600b15
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0067f8bffa..109cae5452 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1929,6 +1929,33 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeDirectManagedMemAccessFromHost:
             cdattr = cudaDevAttrDirectManagedMemAccessFromHost;
             break;
+        case hipDeviceAttributeGlobalL1CacheSupported:
+            cdattr = cudaDevAttrGlobalL1CacheSupported;
+            break;
+        case hipDeviceAttributeMaxBlocksPerMultiProcessor:
+            cdattr = cudaDevAttrMaxBlocksPerMultiprocessor;
+            break;
+        case hipDeviceAttributeMultiGpuBoardGroupID:
+            cdattr = cudaDevAttrMultiGpuBoardGroupID;
+            break;
+        case hipDeviceAttributeReservedSharedMemPerBlock:
+            cdattr = cudaDevAttrReservedSharedMemoryPerBlock;
+            break;
+        case hipDeviceAttributeSingleToDoublePrecisionPerfRatio:
+            cdattr = cudaDevAttrSingleToDoublePrecisionPerfRatio;
+            break;
+        case hipDeviceAttributeStreamPrioritiesSupported:
+            cdattr = cudaDevAttrStreamPrioritiesSupported;
+            break;
+        case hipDeviceAttributeSurfaceAlignment:
+            cdattr = cudaDevAttrSurfaceAlignment;
+            break;
+        case hipDeviceAttributeTccDriver:
+            cdattr = cudaDevAttrTccDriver;
+            break;
+        case hipDeviceAttributeUnifiedAddressing:
+            cdattr = cudaDevAttrUnifiedAddressing;
+            break;
 #if CUDA_VERSION >= CUDA_11020
         case hipDeviceAttributeMemoryPoolsSupported:
             cdattr = cudaDevAttrMemoryPoolsSupported;
@@ -1938,6 +1965,21 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
             return hipCUResultTohipError(cuDeviceGetAttribute(pi,
                                                               CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED,
                                                               device));
+        case hipDeviceAttributeAccessPolicyMaxWindowSize:
+            cdattr = cudaDevAttrMaxAccessPolicyWindowSize;
+            break;
+        case hipDeviceAttributeAsyncEngineCount:
+            cdattr = cudaDevAttrAsyncEngineCount;
+            break;
+        case hipDeviceAttributeCanUseHostPointerForRegisteredMem:
+            cdattr = cudaDevAttrCanUseHostPointerForRegisteredMem;
+            break;
+        case hipDeviceAttributeComputePreemptionSupported:
+            cdattr = cudaDevAttrComputePreemptionSupported;
+            break;
+        case hipDeviceAttributeHostNativeAtomicSupported:
+            cdattr = cudaDevAttrHostNativeAtomicSupported;
+            break;
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }

From 4d63e5cabef2cfffacbdd537bc30e8e612934557 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 12 Oct 2022 09:25:30 +0000
Subject: [PATCH 068/177] SWDEV-349226 - Add the hipDataTypes mapping to
 cudaDataTypes

Change-Id: Iadeb2675adf314c02dcc5d722581678ec36f8632
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 34 +++++++++++++++----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 109cae5452..0a6be9d975 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -93,12 +93,34 @@ typedef enum hipMemoryAdvise {
 
 // hipDataType
 #define hipDataType cudaDataType
-#define HIP_R_16F CUDA_R_16F
-#define HIP_R_32F CUDA_R_32F
-#define HIP_R_64F CUDA_R_64F
-#define HIP_C_16F CUDA_C_16F
-#define HIP_C_32F CUDA_C_32F
-#define HIP_C_64F CUDA_C_64F
+#define HIP_R_16F  CUDA_R_16F
+#define HIP_C_16F  CUDA_C_16F
+#define HIP_R_16BF CUDA_R_16BF
+#define HIP_C_16BF CUDA_C_16BF
+#define HIP_R_32F  CUDA_R_32F
+#define HIP_C_32F  CUDA_C_32F
+#define HIP_R_64F  CUDA_R_64F
+#define HIP_C_64F  CUDA_C_64F
+#define HIP_R_4I   CUDA_R_4I
+#define HIP_C_4I   CUDA_C_4I
+#define HIP_R_4U   CUDA_R_4U
+#define HIP_C_4U   CUDA_C_4U
+#define HIP_R_8I   CUDA_R_8I
+#define HIP_C_8I   CUDA_C_8I
+#define HIP_R_8U   CUDA_R_8U
+#define HIP_C_8U   CUDA_C_8U
+#define HIP_R_16I  CUDA_R_16I
+#define HIP_C_16I  CUDA_C_16I
+#define HIP_R_16U  CUDA_R_16U
+#define HIP_C_16U  CUDA_C_16U
+#define HIP_R_32I  CUDA_R_32I
+#define HIP_C_32I  CUDA_C_32I
+#define HIP_R_32U  CUDA_R_32U
+#define HIP_C_32U  CUDA_C_32U
+#define HIP_R_64I  CUDA_R_64I
+#define HIP_C_64I  CUDA_C_64I
+#define HIP_R_64U  CUDA_R_64U
+#define HIP_C_64U  CUDA_C_64U
 
 // hip stream operation masks
 #define STREAM_OPS_WAIT_MASK_32 0xFFFFFFFF

From b8532f6dd43888d82ff034cc24b80b7ef9a1e206 Mon Sep 17 00:00:00 2001
From: pghafari <pghafari@amd.com>
Date: Fri, 14 Oct 2022 16:53:13 -0400
Subject: [PATCH 069/177] SWDEV-362199 - add vulkan interop apis in nvidia

Change-Id: I6c5de6174a80be862b6d869927ed704dd1421b41
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 44 ++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0a6be9d975..2ff8079c39 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1122,6 +1122,16 @@ inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormat
     }
 }
 
+typedef enum cudaExternalMemoryHandleType hipExternalMemoryHandleType;
+typedef struct cudaExternalMemoryHandleDesc hipExternalMemoryHandleDesc;
+typedef struct cudaExternalMemoryBufferDesc hipExternalMemoryBufferDesc;
+typedef cudaExternalMemory_t hipExternalMemory_t;
+typedef enum cudaExternalSemaphoreHandleType hipExternalSemaphoreHandleType;
+typedef struct cudaExternalSemaphoreHandleDesc hipExternalSemaphoreHandleDesc;
+typedef cudaExternalSemaphore_t hipExternalSemaphore_t;
+typedef struct cudaExternalSemaphoreSignalParams hipExternalSemaphoreSignalParams;
+typedef struct cudaExternalSemaphoreWaitParams hipExternalSemaphoreWaitParams;
+
 /**
  * graph types
  *
@@ -2128,7 +2138,7 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(i
                                                       blockSize, dynamicSMemSize, flags));
 }
 
-inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks, 
+inline static hipError_t hipModuleOccupancyMaxActiveBlocksPerMultiprocessor(int* numBlocks,
                                                                  hipFunction_t f,
                                                                  int  blockSize,
                                                                  size_t dynamicSMemSize ){
@@ -2608,6 +2618,38 @@ inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams*
     return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
 }
 
+inline static hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
+                                      const hipExternalSemaphoreHandleDesc* semHandleDesc) {
+  return hipCUDAErrorTohipError(cudaImportExternalSemaphore(extSem_out,(const struct cudaExternalSemaphoreHandleDesc*)semHandleDesc));
+}
+
+inline static hipError_t hipSignalExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                            const hipExternalSemaphoreSignalParams* paramsArray,
+                                            unsigned int numExtSems, hipStream_t stream) {
+  return hipCUDAErrorTohipError(cudaSignalExternalSemaphoresAsync(extSemArray, (const struct cudaExternalSemaphoreSignalParams*)paramsArray, numExtSems, stream));
+}
+inline static hipError_t hipWaitExternalSemaphoresAsync(const hipExternalSemaphore_t* extSemArray,
+                                              const hipExternalSemaphoreWaitParams* paramsArray,
+                                              unsigned int numExtSems, hipStream_t stream) {
+  return hipCUDAErrorTohipError(cudaWaitExternalSemaphoresAsync(extSemArray, (const struct cudaExternalSemaphoreWaitParams*)paramsArray, numExtSems, stream));
+}
+
+inline static hipError_t hipDestroyExternalSemaphore(hipExternalSemaphore_t extSem) {
+  return hipCUDAErrorTohipError(cudaDestroyExternalSemaphore(extSem));
+}
+
+inline static hipError_t hipImportExternalMemory(hipExternalMemory_t* extMem_out, const hipExternalMemoryHandleDesc* memHandleDesc) {
+  return hipCUDAErrorTohipError(cudaImportExternalMemory(extMem_out, (const struct cudaExternalMemoryHandleDesc*)memHandleDesc));
+}
+
+inline static hipError_t hipExternalMemoryGetMappedBuffer(void **devPtr, hipExternalMemory_t extMem, const hipExternalMemoryBufferDesc *bufferDesc) {
+  return hipCUDAErrorTohipError(cudaExternalMemoryGetMappedBuffer(devPtr, extMem, (const struct cudaExternalMemoryBufferDesc*)bufferDesc));
+}
+
+inline static hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem) {
+  return hipCUDAErrorTohipError(cudaDestroyExternalMemory(extMem));
+}
+
 #if CUDA_VERSION >= CUDA_11020
 // ========================== HIP Stream Ordered Memory Allocator =================================
 inline static hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device) {

From 6ce46810d90fdf8e13e19c6572b91ebe0891d288 Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Mon, 14 Nov 2022 12:29:15 +0000
Subject: [PATCH 070/177] SWDEV-366088 - Default case returns -1 as enum value.

Change-Id: I8dd5cac254c99e4de9f6d492e71060e39b069507
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h   | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 2ff8079c39..3397682b26 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1026,8 +1026,10 @@ inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind ki
             return cudaMemcpyDeviceToHost;
         case hipMemcpyDeviceToDevice:
             return cudaMemcpyDeviceToDevice;
-        default:
+        case hipMemcpyDefault:
             return cudaMemcpyDefault;
+        default:
+            return (hipMemcpyKind)-1;
     }
 }
 
@@ -1043,7 +1045,7 @@ inline static enum cudaTextureAddressMode hipTextureAddressModeToCudaTextureAddr
         case hipAddressModeBorder:
             return cudaAddressModeBorder;
         default:
-            return cudaAddressModeWrap;
+            return (hipTextureAddressMode)-1;
     }
 }
 
@@ -1059,7 +1061,7 @@ inline static enum cudaMemRangeAttribute hipMemRangeAttributeToCudaMemRangeAttri
        case hipMemRangeAttributeLastPrefetchLocation:
            return cudaMemRangeAttributeLastPrefetchLocation;
        default:
-           return cudaMemRangeAttributeReadMostly;
+           return (hipMemRangeAttribute)-1;
    }
 }
 
@@ -1079,7 +1081,7 @@ inline static enum cudaMemoryAdvise hipMemoryAdviseTocudaMemoryAdvise(
        case hipMemAdviseUnsetAccessedBy:
            return cudaMemAdviseUnsetAccessedBy;
        default:
-           return cudaMemAdviseSetReadMostly;
+           return (enum cudaMemoryAdvise)-1;
    }
 }
 
@@ -1091,7 +1093,7 @@ inline static enum cudaTextureFilterMode hipTextureFilterModeToCudaTextureFilter
         case hipFilterModeLinear:
             return cudaFilterModeLinear;
         default:
-            return cudaFilterModePoint;
+            return (hipTextureFilterMode)-1;
     }
 }
 
@@ -1102,7 +1104,7 @@ inline static enum cudaTextureReadMode hipTextureReadModeToCudaTextureReadMode(h
         case hipReadModeNormalizedFloat:
             return cudaReadModeNormalizedFloat;
         default:
-            return cudaReadModeElementType;
+            return (hipTextureReadMode)-1;
     }
 }
 
@@ -1118,7 +1120,7 @@ inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormat
         case hipChannelFormatKindNone:
             return cudaChannelFormatKindNone;
         default:
-            return cudaChannelFormatKindNone;
+            return (hipChannelFormatKind)-1;
     }
 }
 

From b95b2ba704d6d4398eb54401554f9965356f1da2 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <anusha.godavarthysurya@amd.com>
Date: Thu, 24 Nov 2022 18:04:36 +0000
Subject: [PATCH 071/177] SWDEV-325711 - Added few graph API support for nvidia
 path

Change-Id: I4eb6e77220e201c85fdaa3a004590c4c5d5ea6c6
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 43 +++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 3397682b26..e30d27505b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -36,6 +36,7 @@ THE SOFTWARE.
 #define CUDA_11020 11020
 #define CUDA_11030 11030
 #define CUDA_11040 11040
+#define CUDA_11060 11060
 
 #ifdef __cplusplus
 extern "C" {
@@ -1207,6 +1208,21 @@ typedef enum cudaStreamUpdateCaptureDependenciesFlags hipStreamUpdateCaptureDepe
 #define hipStreamAddCaptureDependencies cudaStreamAddCaptureDependencies
 #define hipStreamSetCaptureDependencies cudaStreamSetCaptureDependencies
 #endif
+
+#if CUDA_VERSION >= CUDA_11030
+typedef enum cudaGraphDebugDotFlags hipGraphDebugDotFlags;
+#define hipGraphDebugDotFlagsVerbose cudaGraphDebugDotFlagsVerbose
+#define hipGraphDebugDotFlagsKernelNodeParams cudaGraphDebugDotFlagsKernelNodeParams
+#define hipGraphDebugDotFlagsMemcpyNodeParams cudaGraphDebugDotFlagsMemcpyNodeParams
+#define hipGraphDebugDotFlagsMemsetNodeParams cudaGraphDebugDotFlagsMemsetNodeParams
+#define hipGraphDebugDotFlagsHostNodeParams cudaGraphDebugDotFlagsHostNodeParams
+#define hipGraphDebugDotFlagsEventNodeParams cudaGraphDebugDotFlagsEventNodeParams
+#define hipGraphDebugDotFlagsExtSemasSignalNodeParams cudaGraphDebugDotFlagsExtSemasSignalNodeParams
+#define hipGraphDebugDotFlagsExtSemasWaitNodeParams cudaGraphDebugDotFlagsExtSemasWaitNodeParams
+#define hipGraphDebugDotFlagsKernelNodeAttributes cudaGraphDebugDotFlagsKernelNodeAttributes
+#define hipGraphDebugDotFlagsHandles cudaGraphDebugDotFlagsHandles
+#endif
+
 #if CUDA_VERSION >= CUDA_10020
 #define hipMemAllocationGranularityMinimum CU_MEM_ALLOC_GRANULARITY_MINIMUM
 #define hipMemAllocationGranularityRecommended CU_MEM_ALLOC_GRANULARITY_RECOMMENDED
@@ -3380,7 +3396,34 @@ inline static hipError_t hipGraphHostNodeSetParams(hipGraphNode_t node,
                                                    const hipHostNodeParams* pNodeParams) {
     return hipCUDAErrorTohipError(cudaGraphHostNodeSetParams(node, pNodeParams));
 }
+#if CUDA_VERSION >= CUDA_11030
+inline static hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path,
+                                               unsigned int flags) {
+    return hipCUDAErrorTohipError(cudaGraphDebugDotPrint(graph, path, flags));
+}
+#endif
+#if CUDA_VERSION >= CUDA_11000
+inline static hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc,
+                                                          hipGraphNode_t hDst) {
+    return hipCUDAErrorTohipError(cudaGraphKernelNodeCopyAttributes(hSrc, hDst));
+}
+#endif
+#if CUDA_VERSION >= CUDA_11060
+inline static hipError_t hipGraphNodeSetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                                unsigned int isEnabled) {
+    return hipCUDAErrorTohipError(cudaGraphNodeSetEnabled(hGraphExec, hNode, isEnabled));
+}
 
+inline static hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+                                                unsigned int* isEnabled) {
+    return hipCUDAErrorTohipError(cudaGraphNodeGetEnabled(hGraphExec, hNode, isEnabled));
+}
+#endif
+#if CUDA_VERSION >= CUDA_11010
+inline static hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream) {
+    return hipCUDAErrorTohipError(cudaGraphUpload(graphExec, stream));
+}
+#endif
 #endif  //__CUDACC__
 
 #endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H

From 9f1655969b3e4cdfdb9a37d433f6736058724356 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Mon, 28 Nov 2022 13:34:08 +0000
Subject: [PATCH 072/177] SWDEV-369557 - Added enum mapping for
 external[Semaphore/Memory]HandleType

Change-Id: If1b95b084f7fa312e6cdad4ccf433127671a35ce
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 27 +++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index e30d27505b..cd43a6265a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1126,10 +1126,37 @@ inline static enum cudaChannelFormatKind hipChannelFormatKindToCudaChannelFormat
 }
 
 typedef enum cudaExternalMemoryHandleType hipExternalMemoryHandleType;
+#define hipExternalMemoryHandleTypeOpaqueFd cudaExternalMemoryHandleTypeOpaqueFd
+#define hipExternalMemoryHandleTypeOpaqueWin32 cudaExternalMemoryHandleTypeOpaqueWin32
+#define hipExternalMemoryHandleTypeOpaqueWin32Kmt cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+#define hipExternalMemoryHandleTypeD3D12Heap cudaExternalMemoryHandleTypeD3D12Heap
+#define hipExternalMemoryHandleTypeD3D12Resource cudaExternalMemoryHandleTypeD3D12Resource
+#if CUDA_VERSION >= CUDA_10020
+#define hipExternalMemoryHandleTypeD3D11Resource cudaExternalMemoryHandleTypeD3D11Resource
+#define hipExternalMemoryHandleTypeD3D11ResourceKmt cudaExternalMemoryHandleTypeD3D11ResourceKmt
+#define hipExternalMemoryHandleTypeNvSciBuf cudaExternalMemoryHandleTypeNvSciBuf
+#endif
+
 typedef struct cudaExternalMemoryHandleDesc hipExternalMemoryHandleDesc;
 typedef struct cudaExternalMemoryBufferDesc hipExternalMemoryBufferDesc;
 typedef cudaExternalMemory_t hipExternalMemory_t;
+
 typedef enum cudaExternalSemaphoreHandleType hipExternalSemaphoreHandleType;
+#define hipExternalSemaphoreHandleTypeOpaqueFd cudaExternalSemaphoreHandleTypeOpaqueFd
+#define hipExternalSemaphoreHandleTypeOpaqueWin32 cudaExternalSemaphoreHandleTypeOpaqueWin32
+#define hipExternalSemaphoreHandleTypeOpaqueWin32Kmt cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+#define hipExternalSemaphoreHandleTypeD3D12Fence cudaExternalSemaphoreHandleTypeD3D12Fence
+#if CUDA_VERSION >= CUDA_10020
+#define hipExternalSemaphoreHandleTypeD3D11Fence cudaExternalSemaphoreHandleTypeD3D11Fence
+#define hipExternalSemaphoreHandleTypeNvSciSync cudaExternalSemaphoreHandleTypeNvSciSync
+#define hipExternalSemaphoreHandleTypeKeyedMutex cudaExternalSemaphoreHandleTypeKeyedMutex
+#define hipExternalSemaphoreHandleTypeKeyedMutexKmt cudaExternalSemaphoreHandleTypeKeyedMutexKmt
+#endif
+#if CUDA_VERSION >= CUDA_11020
+#define hipExternalSemaphoreHandleTypeTimelineSemaphoreFd cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd
+#define hipExternalSemaphoreHandleTypeTimelineSemaphoreWin32 cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+#endif
+
 typedef struct cudaExternalSemaphoreHandleDesc hipExternalSemaphoreHandleDesc;
 typedef cudaExternalSemaphore_t hipExternalSemaphore_t;
 typedef struct cudaExternalSemaphoreSignalParams hipExternalSemaphoreSignalParams;

From 6767dbd76919028cb593c4a40860304e752be290 Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Thu, 4 Aug 2022 17:27:12 +0530
Subject: [PATCH 073/177] SWDEV-348820 - Fix hipPointerAttribute_t
 incompatibility

- Use hipPointerAttribute_t.type instead of hipPointerAttribute_t.memoryType

Change-Id: I3bf1c0758bdae987213ba7de62247dd420ef4cc9
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cd43a6265a..43de20a8ff 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2227,13 +2227,13 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
 #endif
         switch (memType) {
             case cudaMemoryTypeDevice:
-                attributes->memoryType = hipMemoryTypeDevice;
+                attributes->type = hipMemoryTypeDevice;
                 break;
             case cudaMemoryTypeHost:
-                attributes->memoryType = hipMemoryTypeHost;
+                attributes->type = hipMemoryTypeHost;
                 break;
             case cudaMemoryTypeManaged:
-                attributes->memoryType = hipMemoryTypeManaged;
+                attributes->type = hipMemoryTypeManaged;
                 break;
             default:
                 return hipErrorInvalidValue;

From d1fc24e5831fa2bd6720be214a694135a6f8fce9 Mon Sep 17 00:00:00 2001
From: Your Name <ajay.gunashekar@amd.com>
Date: Mon, 21 Nov 2022 09:43:25 -0800
Subject: [PATCH 074/177] SWDEV-368477 -
 hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags nvidia mapping

SWDEV-369618 - hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags invalid flag scenario
Change-Id: I2edbab5d3eeacf80bc215f0a77a46d733f4b4ec9
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h       | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 43de20a8ff..91d4183422 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -381,6 +381,7 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define hipIpcMemLazyEnablePeerAccess cudaIpcMemLazyEnablePeerAccess
 
 #define hipOccupancyDefault cudaOccupancyDefault
+#define hipOccupancyDisableCachingOverride cudaOccupancyDisableCachingOverride
 
 #define hipCooperativeLaunchMultiDeviceNoPreSync    \
         cudaCooperativeLaunchMultiDeviceNoPreSync
@@ -2812,6 +2813,17 @@ inline static hipError_t hipOccupancyMaxPotentialBlockSize(int* minGridSize, int
                                                            dynamicSMemSize, blockSizeLimit));
 }
 
+template <typename UnaryFunction, class T>
+inline static hipError_t hipOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(int* min_grid_size,
+                                                                                int* block_size,
+                                                                                T func,
+                                                                                UnaryFunction block_size_to_dynamic_smem_size,
+                                                                                int block_size_limit = 0,
+                                                                                unsigned int flags = 0) {
+    return hipCUDAErrorTohipError(cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(min_grid_size, block_size, func,
+                                                    block_size_to_dynamic_smem_size, block_size_limit,flags));
+}
+
 template <class T>
 inline static hipError_t hipOccupancyMaxPotentialBlockSizeWithFlags(int* minGridSize, int* blockSize, T func,
                                                            size_t dynamicSMemSize = 0,

From b222f4f13dc960452026e65728d6db3d2dc63bba Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Fri, 25 Nov 2022 18:08:50 +0500
Subject: [PATCH 075/177] SWDEV-369955 - Support hipDrvGetErrorString and
 hipDrvGetErrorName on Nvidia headers

Change-Id: I54cfa8faed2f45f310682625f94952aa3c2ee27f
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 138 ++++++++++++++++++
 1 file changed, 138 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 91d4183422..dd7e52a19d 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -827,6 +827,136 @@ inline static hipError_t hipCUResultTohipError(CUresult cuError) {
     }
 }
 
+inline static CUresult hipErrorToCUResult(hipError_t hError) {
+    switch (hError) {
+        case hipSuccess:
+            return CUDA_SUCCESS;
+        case hipErrorOutOfMemory:
+            return CUDA_ERROR_OUT_OF_MEMORY;
+        case hipErrorInvalidValue:
+            return CUDA_ERROR_INVALID_VALUE;
+        case hipErrorInvalidDevice:
+            return CUDA_ERROR_INVALID_DEVICE;
+        case hipErrorDeinitialized:
+            return CUDA_ERROR_DEINITIALIZED;
+        case hipErrorNoDevice:
+            return CUDA_ERROR_NO_DEVICE;
+        case hipErrorInvalidContext:
+            return CUDA_ERROR_INVALID_CONTEXT;
+        case hipErrorNotInitialized:
+            return CUDA_ERROR_NOT_INITIALIZED;
+        case hipErrorInvalidHandle:
+            return CUDA_ERROR_INVALID_HANDLE;
+        case hipErrorMapFailed:
+            return CUDA_ERROR_MAP_FAILED;
+        case hipErrorProfilerDisabled:
+            return CUDA_ERROR_PROFILER_DISABLED;
+        case hipErrorProfilerNotInitialized:
+            return CUDA_ERROR_PROFILER_NOT_INITIALIZED;
+        case hipErrorProfilerAlreadyStarted:
+            return CUDA_ERROR_PROFILER_ALREADY_STARTED;
+        case hipErrorProfilerAlreadyStopped:
+            return CUDA_ERROR_PROFILER_ALREADY_STOPPED;
+        case hipErrorInvalidImage:
+            return CUDA_ERROR_INVALID_IMAGE;
+        case hipErrorContextAlreadyCurrent:
+            return CUDA_ERROR_CONTEXT_ALREADY_CURRENT;
+        case hipErrorUnmapFailed:
+            return CUDA_ERROR_UNMAP_FAILED;
+        case hipErrorArrayIsMapped:
+            return CUDA_ERROR_ARRAY_IS_MAPPED;
+        case hipErrorAlreadyMapped:
+            return CUDA_ERROR_ALREADY_MAPPED;
+        case hipErrorNoBinaryForGpu:
+            return CUDA_ERROR_NO_BINARY_FOR_GPU;
+        case hipErrorAlreadyAcquired:
+            return CUDA_ERROR_ALREADY_ACQUIRED;
+        case hipErrorNotMapped:
+            return CUDA_ERROR_NOT_MAPPED;
+        case hipErrorNotMappedAsArray:
+            return CUDA_ERROR_NOT_MAPPED_AS_ARRAY;
+        case hipErrorNotMappedAsPointer:
+            return CUDA_ERROR_NOT_MAPPED_AS_POINTER;
+        case hipErrorECCNotCorrectable:
+            return CUDA_ERROR_ECC_UNCORRECTABLE;
+        case hipErrorUnsupportedLimit:
+            return CUDA_ERROR_UNSUPPORTED_LIMIT;
+        case hipErrorContextAlreadyInUse:
+            return CUDA_ERROR_CONTEXT_ALREADY_IN_USE;
+        case hipErrorPeerAccessUnsupported:
+            return CUDA_ERROR_PEER_ACCESS_UNSUPPORTED;
+        case hipErrorInvalidKernelFile:
+            return CUDA_ERROR_INVALID_PTX;
+        case hipErrorInvalidGraphicsContext:
+            return CUDA_ERROR_INVALID_GRAPHICS_CONTEXT;
+        case hipErrorInvalidSource:
+            return CUDA_ERROR_INVALID_SOURCE;
+        case hipErrorFileNotFound:
+            return CUDA_ERROR_FILE_NOT_FOUND;
+        case hipErrorSharedObjectSymbolNotFound:
+            return CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND;
+        case hipErrorSharedObjectInitFailed:
+            return CUDA_ERROR_SHARED_OBJECT_INIT_FAILED;
+        case hipErrorOperatingSystem:
+            return CUDA_ERROR_OPERATING_SYSTEM;
+        case hipErrorIllegalState:
+            return CUDA_ERROR_ILLEGAL_STATE;
+        case hipErrorNotFound:
+            return CUDA_ERROR_NOT_FOUND;
+        case hipErrorNotReady:
+            return CUDA_ERROR_NOT_READY;
+        case hipErrorIllegalAddress:
+            return CUDA_ERROR_ILLEGAL_ADDRESS;
+        case hipErrorLaunchOutOfResources:
+            return CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES;
+        case hipErrorLaunchTimeOut:
+            return CUDA_ERROR_LAUNCH_TIMEOUT;
+        case hipErrorPeerAccessAlreadyEnabled:
+            return CUDA_ERROR_PEER_ACCESS_ALREADY_ENABLED;
+        case hipErrorPeerAccessNotEnabled:
+            return CUDA_ERROR_PEER_ACCESS_NOT_ENABLED;
+        case hipErrorSetOnActiveProcess:
+            return CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE;
+        case hipErrorContextIsDestroyed:
+            return CUDA_ERROR_CONTEXT_IS_DESTROYED;
+        case hipErrorAssert:
+            return CUDA_ERROR_ASSERT;
+        case hipErrorHostMemoryAlreadyRegistered:
+            return CUDA_ERROR_HOST_MEMORY_ALREADY_REGISTERED;
+        case hipErrorHostMemoryNotRegistered:
+            return CUDA_ERROR_HOST_MEMORY_NOT_REGISTERED;
+        case hipErrorLaunchFailure:
+            return CUDA_ERROR_LAUNCH_FAILED;
+        case hipErrorCooperativeLaunchTooLarge:
+            return CUDA_ERROR_COOPERATIVE_LAUNCH_TOO_LARGE;
+        case hipErrorNotSupported:
+            return CUDA_ERROR_NOT_SUPPORTED;
+        case hipErrorStreamCaptureUnsupported:
+            return CUDA_ERROR_STREAM_CAPTURE_UNSUPPORTED;
+        case hipErrorStreamCaptureInvalidated:
+            return CUDA_ERROR_STREAM_CAPTURE_INVALIDATED;
+        case hipErrorStreamCaptureMerge:
+            return CUDA_ERROR_STREAM_CAPTURE_MERGE;
+        case hipErrorStreamCaptureUnmatched:
+            return CUDA_ERROR_STREAM_CAPTURE_UNMATCHED;
+        case hipErrorStreamCaptureUnjoined:
+            return CUDA_ERROR_STREAM_CAPTURE_UNJOINED;
+        case hipErrorStreamCaptureIsolation:
+            return CUDA_ERROR_STREAM_CAPTURE_ISOLATION;
+        case hipErrorStreamCaptureImplicit:
+            return CUDA_ERROR_STREAM_CAPTURE_IMPLICIT;
+        case hipErrorCapturedEvent:
+            return CUDA_ERROR_CAPTURED_EVENT;
+        case hipErrorStreamCaptureWrongThread:
+            return CUDA_ERROR_STREAM_CAPTURE_WRONG_THREAD;
+        case hipErrorGraphExecUpdateFailure:
+            return CUDA_ERROR_GRAPH_EXEC_UPDATE_FAILURE;
+        case hipErrorUnknown:
+        default:
+            return CUDA_ERROR_UNKNOWN;  // Note - translated error.
+    }
+}
+
 inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
     switch (hError) {
         case hipSuccess:
@@ -1702,6 +1832,14 @@ inline static const char* hipGetErrorName(hipError_t error) {
     return cudaGetErrorName(hipErrorToCudaError(error));
 }
 
+inline static hipError_t hipDrvGetErrorString(hipError_t error, const char** errorString) {
+    return hipCUResultTohipError(cuGetErrorString(hipErrorToCUResult(error), errorString));
+}
+
+inline static hipError_t hipDrvGetErrorName(hipError_t error, const char** errorString) {
+    return hipCUResultTohipError(cuGetErrorName(hipErrorToCUResult(error), errorString));
+}
+
 inline static hipError_t hipGetDeviceCount(int* count) {
     return hipCUDAErrorTohipError(cudaGetDeviceCount(count));
 }

From c1fe38aff6216b63ca2db646f527f8560f506c6c Mon Sep 17 00:00:00 2001
From: pghafari <pghafari@amd.com>
Date: Sun, 11 Dec 2022 22:28:36 -0500
Subject: [PATCH 076/177] SWDEV-369042 - updating to GetError to match cuda

Change-Id: I0ec2330443b26cb1c8cedba942f31fb1267cd09d
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h     | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index dd7e52a19d..712d793c51 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1833,11 +1833,21 @@ inline static const char* hipGetErrorName(hipError_t error) {
 }
 
 inline static hipError_t hipDrvGetErrorString(hipError_t error, const char** errorString) {
-    return hipCUResultTohipError(cuGetErrorString(hipErrorToCUResult(error), errorString));
+    CUresult err = hipErrorToCUResult(error);
+    if( err == CUDA_ERROR_UNKNOWN ) {
+       return hipCUResultTohipError(cuGetErrorString((CUresult)error, errorString));
+    } else {
+       return hipCUResultTohipError(cuGetErrorString(err, errorString));
+    }
 }
 
 inline static hipError_t hipDrvGetErrorName(hipError_t error, const char** errorString) {
-    return hipCUResultTohipError(cuGetErrorName(hipErrorToCUResult(error), errorString));
+    CUresult err = hipErrorToCUResult(error);
+    if( err == CUDA_ERROR_UNKNOWN ) {
+       return hipCUResultTohipError(cuGetErrorName((CUresult)error, errorString));
+    } else {
+       return hipCUResultTohipError(cuGetErrorName(err, errorString));
+    }
 }
 
 inline static hipError_t hipGetDeviceCount(int* count) {

From 98089093205f88ebd6f4442dd43d28167047eb72 Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Tue, 6 Dec 2022 00:04:55 +0530
Subject: [PATCH 077/177] SWDEV-368881 - Implement hipModuleLaunchCooperative*
 APIs

- Add implementation for hipModuleLaunchCooperativeKernel, hipModuleLaunchCooperativeKernelMultiDevice APIs

Change-Id: I2a80264e458549211c34b413397a08b57d021147
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 712d793c51..f561ed3876 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -437,6 +437,7 @@ typedef struct cudaArray* hipArray_t;
 typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
 typedef struct cudaLaunchParams hipLaunchParams;
+typedef CUDA_LAUNCH_PARAMS hipFunctionLaunchParams;
 #define hipFunction_attribute CUfunction_attribute
 #define hipPointer_attribute CUpointer_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
@@ -2807,11 +2808,29 @@ inline static hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim,
             cudaLaunchCooperativeKernel(f, gridDim, blockDim, kernelParams, sharedMemBytes, stream));
 }
 
+inline static hipError_t hipModuleLaunchCooperativeKernel(hipFunction_t f, unsigned int gridDimX,
+                                            unsigned int gridDimY, unsigned int gridDimZ,
+                                            unsigned int blockDimX, unsigned int blockDimY,
+                                            unsigned int blockDimZ, unsigned int sharedMemBytes,
+                                            hipStream_t stream, void** kernelParams) {
+    return hipCUResultTohipError(cuLaunchCooperativeKernel(f, gridDimX, gridDimY, gridDimZ,
+                                                           blockDimX, blockDimY, blockDimZ,
+                                                           sharedMemBytes, stream,kernelParams));
+}
+
 inline static hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsList,
                                                  int  numDevices, unsigned int  flags) {
     return hipCUDAErrorTohipError(cudaLaunchCooperativeKernelMultiDevice(launchParamsList, numDevices, flags));
 }
 
+inline static hipError_t hipModuleLaunchCooperativeKernelMultiDevice(
+                                                       hipFunctionLaunchParams* launchParamsList,
+                                                       unsigned int  numDevices,
+                                                       unsigned int  flags) {
+    return hipCUResultTohipError(cuLaunchCooperativeKernelMultiDevice(launchParamsList,
+                                                                      numDevices, flags));
+}
+
 inline static hipError_t hipImportExternalSemaphore(hipExternalSemaphore_t* extSem_out,
                                       const hipExternalSemaphoreHandleDesc* semHandleDesc) {
   return hipCUDAErrorTohipError(cudaImportExternalSemaphore(extSem_out,(const struct cudaExternalSemaphoreHandleDesc*)semHandleDesc));

From 9c531b6a674034fff2ecf32fca2feb2be69471b7 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Mon, 9 Jan 2023 14:36:05 +0000
Subject: [PATCH 078/177] SWDEV-374405 - Added OpenGL interoperability APIs for
 nvidia

Change-Id: I1d6ad5c610ab6ee805bd07dacea316b3242ed129
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 51 +++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f561ed3876..0c4fc9e5c5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -27,6 +27,8 @@ THE SOFTWARE.
 #include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <cuda_fp16.h>
+#include <cuda_gl_interop.h>
+
 #include <stdio.h>
 
 #define CUDA_9000 9000
@@ -1294,6 +1296,21 @@ typedef cudaExternalSemaphore_t hipExternalSemaphore_t;
 typedef struct cudaExternalSemaphoreSignalParams hipExternalSemaphoreSignalParams;
 typedef struct cudaExternalSemaphoreWaitParams hipExternalSemaphoreWaitParams;
 
+typedef enum cudaGLDeviceList hipGLDeviceList;
+#define hipGLDeviceListAll cudaGLDeviceListAll
+#define hipGLDeviceListCurrentFrame  cudaGLDeviceListCurrentFrame
+#define hipGLDeviceListNextFrame  cudaGLDeviceListNextFrame
+
+typedef struct cudaGraphicsResource hipGraphicsResource;
+typedef cudaGraphicsResource_t hipGraphicsResource_t;
+
+typedef enum cudaGraphicsRegisterFlags hipGraphicsRegisterFlags;
+#define hipGraphicsRegisterFlagsNone cudaGraphicsRegisterFlagsNone
+#define hipGraphicsRegisterFlagsReadOnly cudaGraphicsRegisterFlagsReadOnly
+#define hipGraphicsRegisterFlagsWriteDiscard cudaGraphicsRegisterFlagsWriteDiscard
+#define hipGraphicsRegisterFlagsSurfaceLoadStore cudaGraphicsRegisterFlagsSurfaceLoadStore
+#define hipGraphicsRegisterFlagsTextureGather cudaGraphicsRegisterFlagsTextureGather
+
 /**
  * graph types
  *
@@ -2863,6 +2880,40 @@ inline static hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem) {
   return hipCUDAErrorTohipError(cudaDestroyExternalMemory(extMem));
 }
 
+inline static hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount,
+                                         hipGLDeviceList deviceList) {
+  return hipCUDAErrorTohipError(cudaGLGetDevices(pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList));
+}
+
+inline static hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer, unsigned int flags) {
+  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterBuffer(resource, buffer, flags));
+}
+
+inline static hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target, unsigned int flags) {
+  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterImage(resource, image, target, flags));
+}
+
+inline static hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources, hipStream_t stream  __dparm(0)) {
+  return hipCUDAErrorTohipError(cudaGraphicsMapResources(count, resources, stream));
+}
+
+inline static hipError_t hipGraphicsSubResourceGetMappedArray(hipArray_t* array, hipGraphicsResource_t resource, unsigned int arrayIndex,
+                                                              unsigned int mipLevel) {
+  return hipCUDAErrorTohipError(cudaGraphicsSubResourceGetMappedArray(array, resource, arrayIndex, mipLevel));
+}
+
+inline static hipError_t hipGraphicsResourceGetMappedPointer(void** devPtr, size_t* size, hipGraphicsResource_t resource) {
+  return hipCUDAErrorTohipError(cudaGraphicsResourceGetMappedPointer(devPtr, size, resource));
+}
+
+inline static hipError_t hipGraphicsUnmapResources(int count, hipGraphicsResource_t* resources, hipStream_t stream  __dparm(0)) {
+  return hipCUDAErrorTohipError(cudaGraphicsUnmapResources(count, resources, stream));
+}
+
+inline static hipError_t hipGraphicsUnregisterResource(hipGraphicsResource_t resource) {
+  return hipCUDAErrorTohipError(cudaGraphicsUnregisterResource(resource));
+}
+
 #if CUDA_VERSION >= CUDA_11020
 // ========================== HIP Stream Ordered Memory Allocator =================================
 inline static hipError_t hipDeviceGetDefaultMemPool(hipMemPool_t* mem_pool, int device) {

From ec92380b663293de7029cc2255307665dcced00d Mon Sep 17 00:00:00 2001
From: German Andryeyev <German.Andryeyev@amd.com>
Date: Wed, 14 Dec 2022 18:54:16 -0500
Subject: [PATCH 079/177] SWDEV-353281 - Initial support of memalloc in graph

Add memory allocation support to graphs. The current implementation uses
the mempool cache to hold the allocations which belong to the graph.
Resource tracking is disabled for now, because the mempool operates on
hip::Stream objects, whereas graph execution uses amd::HostQueue
objects.

Change-Id: I54fe3250126d24f5a26ada975f37d429bb4ef17b
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0c4fc9e5c5..8ac312b400 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1340,6 +1340,10 @@ typedef struct cudaHostNodeParams hipHostNodeParams;
 typedef struct cudaKernelNodeParams hipKernelNodeParams;
 typedef struct cudaMemsetParams hipMemsetParams;
 
+#if CUDA_VERSION >= CUDA_11040
+typedef struct cudaMemAllocNodeParams hipMemAllocNodeParams;
+#endif
+
 typedef enum cudaGraphExecUpdateResult hipGraphExecUpdateResult;
 #define hipGraphExecUpdateSuccess cudaGraphExecUpdateSuccess
 #define hipGraphExecUpdateError cudaGraphExecUpdateError
@@ -3208,6 +3212,30 @@ inline static hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec
                                                       unsigned long long flags) {
     return hipCUDAErrorTohipError(cudaGraphInstantiateWithFlags(pGraphExec, graph, flags));
 }
+
+inline hipError_t hipGraphAddMemAllocNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                          const hipGraphNode_t* pDependencies,
+                                          size_t numDependencies,
+                                          hipMemAllocNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphAddMemAllocNode(
+        pGraphNode, graph, pDependencies, numDependencies, pNodeParams));
+}
+
+inline hipError_t hipGraphMemAllocNodeGetParams(hipGraphNode_t node,
+                                                hipMemAllocNodeParams* pNodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphMemAllocNodeGetParams(node, pNodeParams));
+}
+
+inline hipError_t hipGraphAddMemFreeNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                         const hipGraphNode_t* pDependencies,
+                                         size_t numDependencies, void* dev_ptr) {
+    return hipCUDAErrorTohipError(cudaGraphAddMemFreeNode(
+        pGraphNode, graph, pDependencies, numDependencies, dev_ptr));
+}
+
+inline hipError_t hipGraphMemFreeNodeGetParams(hipGraphNode_t node, void* dev_ptr) {
+    return hipCUDAErrorTohipError(cudaGraphMemFreeNodeGetParams(node, dev_ptr));
+}
 #endif
 inline static hipError_t hipGraphLaunch(hipGraphExec_t graphExec, hipStream_t stream) {
     return hipCUDAErrorTohipError(cudaGraphLaunch(graphExec, stream));

From cf18c50814fec0ce14058ce29ee3b512b82bf608 Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Sun, 15 Jan 2023 12:16:46 +0000
Subject: [PATCH 080/177] SWDEV-369961 - Add mappings for enum
 hipGraphInstantiateFlags and types.

Change-Id: I03fa8221684b08a0c44e12aaf2eb7ce5281be629
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 8ac312b400..0c492b7c66 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -543,6 +543,12 @@ typedef CUDA_RESOURCE_VIEW_DESC HIP_RESOURCE_VIEW_DESC;
 #define HIP_POINTER_ATTRIBUTE_ACCESS_FLAGS      CU_POINTER_ATTRIBUTE_ACCESS_FLAGS
 #define HIP_POINTER_ATTRIBUTE_MEMPOOL_HANDLE    CU_POINTER_ATTRIBUTE_MEMPOOL_HANDLE
 
+typedef enum cudaGraphInstantiateFlags hipGraphInstantiateFlags;
+#define hipGraphInstantiateFlagAutoFreeOnLaunch cudaGraphInstantiateFlagAutoFreeOnLaunch
+#define hipGraphInstantiateFlagUpload cudaGraphInstantiateFlagUpload
+#define hipGraphInstantiateFlagDeviceLaunch cudaGraphInstantiateFlagDeviceLaunch
+#define hipGraphInstantiateFlagUseNodePriority cudaGraphInstantiateFlagUseNodePriority
+
 #if CUDA_VERSION >= CUDA_9000
 #define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
 #define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)

From bb635fd5ff16f3f9c56722512dab30d033ae4733 Mon Sep 17 00:00:00 2001
From: Rakesh Roy <rakesh.roy@amd.com>
Date: Wed, 17 Aug 2022 23:36:56 +0530
Subject: [PATCH 081/177] SWDEV-338733 - Implement hipArrayGet* APIs

- Add implementation for hipArrayGetInfo, hipArrayGetDescriptor &
  hipArray3DGetDescriptor APIs

Change-Id: I181a472066006bc3bd0d987408ea67e218310983
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0c492b7c66..a06f9b45b9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3186,6 +3186,21 @@ inline static hipError_t hipArray3DCreate(hiparray* pHandle,
     return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
 }
 
+inline static hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent,
+                                          unsigned int* flags, hipArray* array) {
+    return hipCUDAErrorTohipError(cudaArrayGetInfo(desc, extent, flags, array));
+}
+
+inline static hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor,
+                                               hipArray* array) {
+    return hipCUResultTohipError(cuArrayGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
+inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor,
+                                                 hipArray* array) {
+    return hipCUResultTohipError(cuArray3DGetDescriptor(pArrayDescriptor, (CUarray)array));
+}
+
 inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
     return hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode));
 }

From 3fc34b8bf50a5c94ea969f888ebf4651127b0032 Mon Sep 17 00:00:00 2001
From: Ajay <ajay.gunashekar@amd.com>
Date: Fri, 17 Feb 2023 22:46:39 +0000
Subject: [PATCH 082/177] SWDEV-384100 - HIP support for CUDA 12.0

Apps are failing to build due to undefined deprecated texture APIs

Change-Id: I1fb64adc4bc0ba6ee6ecaa65d54b34da0327e6a3
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a06f9b45b9..d770182632 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 #define CUDA_11060 11060
+#define CUDA_12000 12000
 
 #ifdef __cplusplus
 extern "C" {
@@ -2773,6 +2774,7 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t
     return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
                                                          struct textureReference* tex,
                                                          const void* devPtr,
@@ -2786,6 +2788,8 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
     const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
     return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }
+#endif // CUDA_VERSION < CUDA_12000
+
 
 inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
                                                         hipChannelFormatKind f) {
@@ -2818,10 +2822,12 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
     return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }
+#endif
 
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
@@ -3067,6 +3073,7 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                                                                  blockSize, dynamicSMemSize, flags));
 }
 
+#if CUDA_VERSION < CUDA_12000
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
                                         const void* devPtr, size_t size = UINT_MAX) {
@@ -3109,6 +3116,7 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
     struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
     return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }
+#endif   // CUDA_VERSION < CUDA_12000
 
 template <class T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {

From 8ce9ec60f8071d192b1c4d42e3121e1ab726062b Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Tue, 17 Jan 2023 10:40:05 +0000
Subject: [PATCH 083/177] SWDEV-372153 - Add hipStreamGetDevice Implementation

Change-Id: Ifd1f13e311e8221ca6d94cf27f9131eb97678067
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h     | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d770182632..4c8be9af3d 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2507,6 +2507,20 @@ inline static hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallb
         cudaStreamAddCallback(stream, (cudaStreamCallback_t)callback, userData, flags));
 }
 
+inline static hipError_t hipStreamGetDevice(hipStream_t stream, hipDevice_t* device) {
+    hipCtx_t context;
+    auto err = hipCUResultTohipError(cuStreamGetCtx(stream, &context));
+    if (err != hipSuccess) return err;
+
+    err = hipCUResultTohipError(cuCtxPushCurrent(context));
+    if (err != hipSuccess) return err;
+    // The stream's context is now pushed; every path below must pop it again.
+    err = hipCUResultTohipError(cuCtxGetDevice(device));
+    if (err != hipSuccess) { (void)cuCtxPopCurrent(&context); return err; }
+
+    return hipCUResultTohipError(cuCtxPopCurrent(&context));
+}
+
 inline static hipError_t hipDriverGetVersion(int* driverVersion) {
     return hipCUDAErrorTohipError(cudaDriverGetVersion(driverVersion));
 }

From 81717d6329a434551047ce612d0aedc10ddcc819 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <anusha.godavarthysurya@amd.com>
Date: Tue, 21 Feb 2023 21:45:14 +0000
Subject: [PATCH 084/177] SWDEV-330658 - Added flag hipHostRegisterReadOnly

Change-Id: Idb59dc6187e99512546dfeafde44c08ae85f6057
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 4c8be9af3d..2c8ca320e8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,7 +39,6 @@ THE SOFTWARE.
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 #define CUDA_11060 11060
-#define CUDA_12000 12000
 
 #ifdef __cplusplus
 extern "C" {
@@ -374,6 +373,7 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define hipHostRegisterPortable cudaHostRegisterPortable
 #define hipHostRegisterMapped cudaHostRegisterMapped
 #define hipHostRegisterIoMemory cudaHostRegisterIoMemory
+#define hipHostRegisterReadOnly cudaHostRegisterReadOnly
 
 #define HIP_LAUNCH_PARAM_BUFFER_POINTER CU_LAUNCH_PARAM_BUFFER_POINTER
 #define HIP_LAUNCH_PARAM_BUFFER_SIZE CU_LAUNCH_PARAM_BUFFER_SIZE
@@ -2788,7 +2788,6 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t
     return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }
 
-#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
                                                          struct textureReference* tex,
                                                          const void* devPtr,
@@ -2802,8 +2801,6 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
     const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
     return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }
-#endif // CUDA_VERSION < CUDA_12000
-
 
 inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
                                                         hipChannelFormatKind f) {
@@ -2836,12 +2833,10 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
     return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }
 
-#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }
-#endif
 
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
@@ -3087,7 +3082,6 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                                                                  blockSize, dynamicSMemSize, flags));
 }
 
-#if CUDA_VERSION < CUDA_12000
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
                                         const void* devPtr, size_t size = UINT_MAX) {
@@ -3130,7 +3124,6 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
     struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
     return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }
-#endif   // CUDA_VERSION < CUDA_12000
 
 template <class T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {

From cafad8f51e3a5dadba36c2b2697e1e65a3d2898e Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Tue, 7 Mar 2023 14:34:10 +0000
Subject: [PATCH 085/177] SWDEV-387173 - Added cuda mappings for
 hipGraphMemAttr* enums

Change-Id: Ie5485ec518c6d107901a429ac3614e22caa9196b
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 2c8ca320e8..0522cc8ef7 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1383,6 +1383,10 @@ typedef enum cudaAccessProperty hipAccessProperty;
 typedef struct cudaAccessPolicyWindow hipAccessPolicyWindow;
 
 typedef enum  cudaGraphMemAttributeType hipGraphMemAttributeType;
+#define hipGraphMemAttrUsedMemCurrent cudaGraphMemAttrUsedMemCurrent
+#define hipGraphMemAttrUsedMemHigh cudaGraphMemAttrUsedMemHigh
+#define hipGraphMemAttrReservedMemCurrent cudaGraphMemAttrReservedMemCurrent
+#define hipGraphMemAttrReservedMemHigh cudaGraphMemAttrReservedMemHigh
 
 typedef enum cudaUserObjectFlags hipUserObjectFlags;
 #define hipUserObjectNoDestructorSync cudaUserObjectNoDestructorSync

From cedc75f7e86256cb41cd6ac988242ce47fb609b8 Mon Sep 17 00:00:00 2001
From: Ajay <ajay.gunashekar@amd.com>
Date: Wed, 22 Mar 2023 16:45:42 +0000
Subject: [PATCH 086/177] SWDEV-390170 - guard deprecated runtime apis on CUDA
 12.0

Change-Id: I4ab967c276dfa98f75d508b9484e8d1324b4e4a7
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0522cc8ef7..36ea5ca5fa 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11030 11030
 #define CUDA_11040 11040
 #define CUDA_11060 11060
+#define CUDA_12000 12000
 
 #ifdef __cplusplus
 extern "C" {
@@ -2792,6 +2793,7 @@ inline static hipError_t hipFuncSetCacheConfig(const void* func, hipFuncCache_t
     return hipCUDAErrorTohipError(cudaFuncSetCacheConfig(func, cacheConfig));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipBindTexture(size_t* offset,
                                                          struct textureReference* tex,
                                                          const void* devPtr,
@@ -2805,6 +2807,8 @@ __HIP_DEPRECATED inline static hipError_t hipBindTexture2D(
     const hipChannelFormatDesc* desc, size_t width, size_t height, size_t pitch) {
     return hipCUDAErrorTohipError(cudaBindTexture2D(offset, tex, devPtr, desc, width, height, pitch));
 }
+#endif // CUDA_VERSION < CUDA_12000
+
 
 inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int w,
                                                         hipChannelFormatKind f) {
@@ -2837,10 +2841,12 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
     return hipCUDAErrorTohipError(cudaGetTextureObjectResourceDesc( pResDesc, textureObject));
 }
 
+#if CUDA_VERSION < CUDA_12000
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));
 }
+#endif
 
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
@@ -3086,6 +3092,7 @@ inline static hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
                                                                  blockSize, dynamicSMemSize, flags));
 }
 
+#if CUDA_VERSION < CUDA_12000
 template <class T, int dim, enum cudaTextureReadMode readMode>
 inline static hipError_t hipBindTexture(size_t* offset, const struct texture<T, dim, readMode>& tex,
                                         const void* devPtr, size_t size = UINT_MAX) {
@@ -3128,6 +3135,7 @@ __HIP_DEPRECATED inline static hipError_t hipBindTextureToArray(
     struct texture<T, dim, readMode>& tex, hipArray_const_t array) {
     return hipCUDAErrorTohipError(cudaBindTextureToArray(tex, array));
 }
+#endif   // CUDA_VERSION < CUDA_12000
 
 template <class T>
 inline static hipChannelFormatDesc hipCreateChannelDesc() {

From 0f95d1d31da2296c3b0fa82a3b9fbf6adf408904 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Tue, 25 Apr 2023 18:25:10 +0100
Subject: [PATCH 087/177] SWDEV-393199 - Added new include file for opengl
 interop mappings for nvidia

Change-Id: I2e955a9dd06539939188a5f2e1dde4f173af1202
---
 .../hip/nvidia_detail/nvidia_hip_gl_interop.h | 44 +++++++++++++++++++
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 19 --------
 2 files changed, 44 insertions(+), 19 deletions(-)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_gl_interop.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_gl_interop.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_gl_interop.h
new file mode 100644
index 0000000000..000d5e7c0d
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_gl_interop.h
@@ -0,0 +1,44 @@
+/*
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+#ifndef HIP_INCLUDE_NVIDIA_HIP_GL_INTEROP_H
+#define HIP_INCLUDE_NVIDIA_HIP_GL_INTEROP_H
+
+#include <cuda_gl_interop.h>
+
+typedef enum cudaGLDeviceList hipGLDeviceList;
+#define hipGLDeviceListAll cudaGLDeviceListAll
+#define hipGLDeviceListCurrentFrame  cudaGLDeviceListCurrentFrame
+#define hipGLDeviceListNextFrame  cudaGLDeviceListNextFrame
+
+inline static hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount,
+                                         hipGLDeviceList deviceList) {
+  return hipCUDAErrorTohipError(cudaGLGetDevices(pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList));
+}
+
+inline static hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer, unsigned int flags) {
+  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterBuffer(resource, buffer, flags));
+}
+
+inline static hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target, unsigned int flags) {
+  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterImage(resource, image, target, flags));
+}
+#endif
\ No newline at end of file
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 36ea5ca5fa..0a1be4296e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -27,7 +27,6 @@ THE SOFTWARE.
 #include <cuda.h>
 #include <cuda_profiler_api.h>
 #include <cuda_fp16.h>
-#include <cuda_gl_interop.h>
 
 #include <stdio.h>
 
@@ -1304,11 +1303,6 @@ typedef cudaExternalSemaphore_t hipExternalSemaphore_t;
 typedef struct cudaExternalSemaphoreSignalParams hipExternalSemaphoreSignalParams;
 typedef struct cudaExternalSemaphoreWaitParams hipExternalSemaphoreWaitParams;
 
-typedef enum cudaGLDeviceList hipGLDeviceList;
-#define hipGLDeviceListAll cudaGLDeviceListAll
-#define hipGLDeviceListCurrentFrame  cudaGLDeviceListCurrentFrame
-#define hipGLDeviceListNextFrame  cudaGLDeviceListNextFrame
-
 typedef struct cudaGraphicsResource hipGraphicsResource;
 typedef cudaGraphicsResource_t hipGraphicsResource_t;
 
@@ -2915,19 +2909,6 @@ inline static hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem) {
   return hipCUDAErrorTohipError(cudaDestroyExternalMemory(extMem));
 }
 
-inline static hipError_t hipGLGetDevices(unsigned int* pHipDeviceCount, int* pHipDevices, unsigned int hipDeviceCount,
-                                         hipGLDeviceList deviceList) {
-  return hipCUDAErrorTohipError(cudaGLGetDevices(pHipDeviceCount, pHipDevices, hipDeviceCount, deviceList));
-}
-
-inline static hipError_t hipGraphicsGLRegisterBuffer(hipGraphicsResource** resource, GLuint buffer, unsigned int flags) {
-  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterBuffer(resource, buffer, flags));
-}
-
-inline static hipError_t hipGraphicsGLRegisterImage(hipGraphicsResource** resource, GLuint image, GLenum target, unsigned int flags) {
-  return hipCUDAErrorTohipError(cudaGraphicsGLRegisterImage(resource, image, target, flags));
-}
-
 inline static hipError_t hipGraphicsMapResources(int count, hipGraphicsResource_t* resources, hipStream_t stream  __dparm(0)) {
   return hipCUDAErrorTohipError(cudaGraphicsMapResources(count, resources, stream));
 }

From e5e95946a129397c6dcf0b017bd94f44fbe01d33 Mon Sep 17 00:00:00 2001
From: Ajay <ajay.gunashekar@amd.com>
Date: Thu, 13 Apr 2023 22:37:16 +0000
Subject: [PATCH 088/177] SWDEV-394488 - cudaStreamGetCaptureInfo_v2 is
 undefined in CUDA 12.0

hipStreamPerThrdCompilerOptn.cc test fails to build with cudaStreamGetCaptureInfo_v2
in CUDA 12.0.
The fix is to replace the runtime API cudaStreamGetCaptureInfo_v2
with the driver API cuStreamGetCaptureInfo_v2.

Change-Id: I44a0110770d3246f5345092acae301c9a2f6d520
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 0a1be4296e..6236637886 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3493,8 +3493,10 @@ inline static hipError_t hipStreamGetCaptureInfo_v2(
     hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
     unsigned long long* id_out __dparm(0), hipGraph_t* graph_out __dparm(0),
     const hipGraphNode_t** dependencies_out __dparm(0), size_t* numDependencies_out __dparm(0)) {
-    return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo_v2(
-        stream, captureStatus_out, id_out, graph_out, dependencies_out, numDependencies_out));
+    return hipCUResultTohipError(cuStreamGetCaptureInfo_v2(
+        stream, reinterpret_cast<CUstreamCaptureStatus *>(captureStatus_out),
+        reinterpret_cast<cuuint64_t *>(id_out), graph_out,
+        dependencies_out, numDependencies_out));
 }
 #endif
 

From ea190a25713d5f29c46f5cd534517a341f6294df Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Wed, 10 May 2023 19:30:39 +0100
Subject: [PATCH 089/177] SWDEV-367537 - Add __hip_bfloat16 and vector
 definitions to match __nv_bfloat16

Change-Id: I1c0f9f5f278c2c3b4e175d9f08831ba458ed856e
---
 .../hip/nvidia_detail/nvidia_hip_bf16.h       | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h
new file mode 100644
index 0000000000..118996af1d
--- /dev/null
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h
@@ -0,0 +1,32 @@
+/*
+Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+
+#ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H
+#define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H
+
+#include <cuda_bf16.h>
+
+typedef struct __nv_bfloat16 __hip_bfloat16;
+typedef struct __nv_bfloat162 __hip_bfloat162;
+
+#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H

From e5e4a16996188e8959779325b1a3d3401f51fb1e Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Tue, 23 May 2023 22:01:54 +0530
Subject: [PATCH 090/177] SWDEV-401850 - Fix hipCreateChannelDescHalf APIs

- Address ChannelDescHalf1 and ChannelDescHalf2 APIs not returning
correct Channel Descriptors in amd headers
- Add missing hipCreateChannelDescHalf APIs in nvidia headers

Change-Id: I558847425c2459d3cde329ca2e926d882bb0a9dd
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 6236637886..f2033be250 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2809,6 +2809,26 @@ inline static hipChannelFormatDesc hipCreateChannelDesc(int x, int y, int z, int
     return cudaCreateChannelDesc(x, y, z, w, hipChannelFormatKindToCudaChannelFormatKind(f));
 }
 
+inline static hipChannelFormatDesc hipCreateChannelDescHalf() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDescHalf1() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return cudaCreateChannelDesc(e, 0, 0, 0, cudaChannelFormatKindFloat);
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDescHalf2() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return cudaCreateChannelDesc(e, e, 0, 0, cudaChannelFormatKindFloat);
+}
+
+inline static hipChannelFormatDesc hipCreateChannelDescHalf4() {
+    int e = (int)sizeof(unsigned short) * 8;
+    return cudaCreateChannelDesc(e, e, e, e, cudaChannelFormatKindFloat);
+}
+
 inline static hipError_t hipCreateTextureObject(hipTextureObject_t* pTexObject,
                                                 const hipResourceDesc* pResDesc,
                                                 const hipTextureDesc* pTexDesc,

From 01dc9ce02e574bf739ebb11c70e34ea5e54b2adc Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Mon, 29 May 2023 10:37:18 +0000
Subject: [PATCH 091/177] SWDEV-401847 - Update atomicMin/Max for float and
 double.

Change-Id: Ib5be459b8a24f0739e299ed12c9f877f8baa02b1
---
 .../hip/nvidia_detail/nvidia_hip_atomics.h    | 58 ++++++++-----------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h
index f9a92d582a..19fa9673b9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_atomics.h
@@ -25,51 +25,43 @@ THE SOFTWARE.
 
 
 __device__ inline float atomicMax(float* addr, float val) {
-    unsigned int *uaddr = (unsigned int *)addr;
-    float value = __uint_as_float(*uaddr);
-
-    while (value < val) {
-        value = __uint_as_float(atomicCAS(uaddr, __float_as_uint(value),
-                                                 __float_as_uint(val)));
+    int ret = __float_as_int(*addr);
+    while (val > __int_as_float(ret)) {
+        int old = ret;
+        if ((ret = atomicCAS((int *)addr, old, __float_as_int(val))) == old)
+            break;
     }
-    return value;
+    return __int_as_float(ret);
 }
-
 __device__ inline double atomicMax(double* addr, double val) {
-    unsigned long long* uaddr  = (unsigned long long *)addr;
-    double value = __longlong_as_double(*uaddr);
-
-    while (value < val) {
-        value = __longlong_as_double(atomicCAS(uaddr,
-                                        __double_as_longlong(value),
-                                        __double_as_longlong(val)));
+    unsigned long long ret = __double_as_longlong(*addr);
+    while (val > __longlong_as_double(ret)) {
+        unsigned long long old = ret;
+        if ((ret = atomicCAS((unsigned long long *)addr, old, __double_as_longlong(val))) == old)
+            break;
     }
-
-    return value;
+    return __longlong_as_double(ret);
 }
 
 __device__ inline float atomicMin(float* addr, float val) {
-    unsigned int *uaddr = (unsigned int *)addr;
-    float value = __uint_as_float(*uaddr);
-
-    while (value > val) {
-        value = __uint_as_float(atomicCAS(uaddr, __float_as_uint(value),
-                                                 __float_as_uint(val)));
+    int ret = __float_as_int(*addr);
+    while (val < __int_as_float(ret)) {
+        int old = ret;
+        if ((ret = atomicCAS((int *)addr, old, __float_as_int(val))) == old)
+            break;
     }
-    return value;
+    return __int_as_float(ret);
 }
 
 __device__ inline double atomicMin(double* addr, double val) {
-    unsigned long long* uaddr  = (unsigned long long *)addr;
-    double value = __longlong_as_double(*uaddr);
-
-    while (value > val) {
-        value = __longlong_as_double(atomicCAS(uaddr,
-                                         __double_as_longlong(value),
-                                         __double_as_longlong(val)));
+    unsigned long long ret = __double_as_longlong(*addr);
+    while (val < __longlong_as_double(ret)) {
+        unsigned long long old = ret;
+        if ((ret = atomicCAS((unsigned long long *)addr, old, __double_as_longlong(val))) == old)
+            break;
     }
-
-    return value;
+    return __longlong_as_double(ret);
 }
 
+
 #endif

From 7d423570b0763832794758e0d0c7632f808f1e14 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 18 May 2023 10:59:36 +0000
Subject: [PATCH 092/177] SWDEV-395996 - Add double precision constants to math
 constants header

Fixes a typo and adds missing constants matching cuda

Change-Id: Iebf47d107af361cbd356c2a3b9cd6ef121d01363
---
 .../nvidia_detail/nvidia_hip_math_constants.h | 70 ++++++++++++++++++-
 1 file changed, 67 insertions(+), 3 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h
index 7650bb0dec..8b53e853f7 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_math_constants.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015 - 2022 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015 - 2023 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -21,7 +21,10 @@ THE SOFTWARE.
 */
 #ifndef NVIDIA_HIP_MATH_CONSTANTS_H
 #define NVIDIA_HIP_MATH_CONSTANTS_H
+
 #include <math_constants.h>
+
+// single precision constants
 #define HIP_INF_F            CUDART_INF_F
 #define HIP_NAN_F            CUDART_NAN_F
 #define HIP_MIN_DENORM_F     CUDART_MIN_DENORM_F
@@ -57,6 +60,67 @@ THE SOFTWARE.
 #define HIP_REMQUO_BITS_F    CUDART_REMQUO_BITS_F
 #define HIP_REMQUO_MASK_F    CUDART_REMQUO_MASK_F
 #define HIP_TRIG_PLOSS_F     CUDART_TRIG_PLOSS_F
+
+// double precision constants
+#define HIP_INF              CUDART_INF
+#define HIP_NAN              CUDART_NAN
+#define HIP_NEG_ZERO         CUDART_NEG_ZERO
+#define HIP_MIN_DENORM       CUDART_MIN_DENORM
+#define HIP_ZERO             CUDART_ZERO
+#define HIP_ONE              CUDART_ONE
+#define HIP_SQRT_TWO         CUDART_SQRT_TWO
+#define HIP_SQRT_HALF        CUDART_SQRT_HALF
+#define HIP_SQRT_HALF_HI     CUDART_SQRT_HALF_HI
+#define HIP_SQRT_HALF_LO     CUDART_SQRT_HALF_LO
+#define HIP_THIRD            CUDART_THIRD
+#define HIP_TWOTHIRD         CUDART_TWOTHIRD
+#define HIP_PIO4             CUDART_PIO4
+#define HIP_PIO4_HI          CUDART_PIO4_HI
+#define HIP_PIO4_LO          CUDART_PIO4_LO
+#define HIP_PIO2             CUDART_PIO2
+#define HIP_PIO2_HI          CUDART_PIO2_HI
+#define HIP_PIO2_LO          CUDART_PIO2_LO
+#define HIP_3PIO4            CUDART_3PIO4
+#define HIP_2_OVER_PI        CUDART_2_OVER_PI
+#define HIP_PI               CUDART_PI
+#define HIP_PI_HI            CUDART_PI_HI
+#define HIP_PI_LO            CUDART_PI_LO
+#define HIP_SQRT_2PI         CUDART_SQRT_2PI
+#define HIP_SQRT_2PI_HI      CUDART_SQRT_2PI_HI
+#define HIP_SQRT_2PI_LO      CUDART_SQRT_2PI_LO
+#define HIP_SQRT_PIO2        CUDART_SQRT_PIO2
+#define HIP_SQRT_PIO2_HI     CUDART_SQRT_PIO2_HI
+#define HIP_SQRT_PIO2_LO     CUDART_SQRT_PIO2_LO
+#define HIP_SQRT_2OPI        CUDART_SQRT_2OPI
+#define HIP_L2E              CUDART_L2E
+#define HIP_L2E_HI           CUDART_L2E_HI
+#define HIP_L2E_LO           CUDART_L2E_LO
+#define HIP_L2T              CUDART_L2T
+#define HIP_LG2              CUDART_LG2
+#define HIP_LG2_HI           CUDART_LG2_HI
+#define HIP_LG2_LO           CUDART_LG2_LO
+#define HIP_LGE              CUDART_LGE
+#define HIP_LGE_HI           CUDART_LGE_HI
+#define HIP_LGE_LO           CUDART_LGE_LO
+#define HIP_LN2              CUDART_LN2
+#define HIP_LN2_HI           CUDART_LN2_HI
+#define HIP_LN2_LO           CUDART_LN2_LO
+#define HIP_LNT              CUDART_LNT
+#define HIP_LNT_HI           CUDART_LNT_HI
+#define HIP_LNT_LO           CUDART_LNT_LO
+#define HIP_LNPI             CUDART_LNPI
+#define HIP_LN2_X_1024       CUDART_LN2_X_1024
+#define HIP_LN2_X_1025       CUDART_LN2_X_1025
+#define HIP_LN2_X_1075       CUDART_LN2_X_1075
+#define HIP_LG2_X_1024       CUDART_LG2_X_1024
+#define HIP_LG2_X_1075       CUDART_LG2_X_1075
+#define HIP_TWO_TO_23        CUDART_TWO_TO_23
+#define HIP_TWO_TO_52        CUDART_TWO_TO_52
+#define HIP_TWO_TO_53        CUDART_TWO_TO_53
+#define HIP_TWO_TO_54        CUDART_TWO_TO_54
+#define HIP_TWO_TO_M54       CUDART_TWO_TO_M54
+#define HIP_TWO_TO_M1022     CUDART_TWO_TO_M1022
+#define HIP_TRIG_PLOSS       CUDART_TRIG_PLOSS
+#define HIP_DBL2INT_CVT      CUDART_DBL2INT_CVT
+
 #endif
-
-

From c0e2bd22787e54e89b31982e035c254106df478b Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Thu, 11 May 2023 23:25:05 +0100
Subject: [PATCH 093/177] SWDEV-400136 - change make_complex function name to
 make_hipComplex

Change-Id: Ia276610286df80ec35fe11f303eb9919eeda87dd
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
index 2e14e893a3..c6a7cc28b9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_complex.h
@@ -95,7 +95,7 @@ __device__ __host__ static inline double hipCabs(hipDoubleComplex z) { return cu
 
 typedef cuFloatComplex hipComplex;
 
-__device__ __host__ static inline hipComplex make_Complex(float x, float y) {
+__device__ __host__ static inline hipComplex make_hipComplex(float x, float y) {
     return make_cuComplex(x, y);
 }
 

From f6ec01a5f2662c7606a25d044c36a7efd44461c3 Mon Sep 17 00:00:00 2001
From: taosang2 <tao.sang@amd.com>
Date: Thu, 1 Jun 2023 18:31:48 -0400
Subject: [PATCH 094/177] =?UTF-8?q?SWDEV-368553=20=E2=80=93=20Add=20missin?=
 =?UTF-8?q?g=20mipmap=20Apis?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add the missing mipmap API implementations.
Fix some bugs in the mipmap APIs.
Use hipmipmappedArray to differentiate the CUDA runtime
and driver APIs on NVIDIA.
Change-Id: I6079d9f3b2ddf4e42b9a6f7f3902322cfca02cfd
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 27 +++++++++++++++----
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f2033be250..75ca1c1367 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -253,7 +253,8 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
 
 #define hipTexRef CUtexref
 #define hiparray CUarray
-typedef CUmipmappedArray hipMipmappedArray_t;
+typedef CUmipmappedArray hipmipmappedArray;
+typedef cudaMipmappedArray_t hipMipmappedArray_t;
 
 #define HIP_TRSA_OVERRIDE_FORMAT        CU_TRSA_OVERRIDE_FORMAT
 #define HIP_TRSF_READ_AS_INTEGER        CU_TRSF_READ_AS_INTEGER
@@ -1579,22 +1580,38 @@ inline static hipError_t hipFreeArray(hipArray* array) {
     return hipCUDAErrorTohipError(cudaFreeArray(array));
 }
 
-inline static hipError_t hipMipmappedArrayCreate(hipMipmappedArray_t* pHandle,
+inline static hipError_t hipMipmappedArrayCreate(hipmipmappedArray* pHandle,
                                                  HIP_ARRAY3D_DESCRIPTOR* pMipmappedArrayDesc,
                                                  unsigned int numMipmapLevels) {
     return hipCUResultTohipError(cuMipmappedArrayCreate(pHandle, pMipmappedArrayDesc, numMipmapLevels));
 }
 
-inline static hipError_t hipMipmappedArrayDestroy(hipMipmappedArray_t hMipmappedArray) {
+inline static hipError_t hipMipmappedArrayDestroy(hipmipmappedArray hMipmappedArray) {
     return hipCUResultTohipError(cuMipmappedArrayDestroy(hMipmappedArray));
 }
 
-inline static hipError_t hipMipmappedArrayGetLevel(hipArray_t* pLevelArray,
-                                                   hipMipmappedArray_t hMipMappedArray,
+inline static hipError_t hipMipmappedArrayGetLevel(hiparray* pLevelArray,
+                                                   hipmipmappedArray hMipMappedArray,
                                                    unsigned int level) {
     return hipCUResultTohipError(cuMipmappedArrayGetLevel((CUarray*)pLevelArray, hMipMappedArray, level));
 }
 
+inline static hipError_t hipMallocMipmappedArray(hipMipmappedArray_t* pHandle,
+                                                 const hipChannelFormatDesc* desc, hipExtent extent,
+                                                 unsigned int numLevels, unsigned int flags = 0) {
+    return hipCUDAErrorTohipError(cudaMallocMipmappedArray(pHandle, desc, extent, numLevels, flags));
+}
+
+inline static hipError_t hipFreeMipmappedArray(hipMipmappedArray_t hMipmappedArray) {
+    return hipCUDAErrorTohipError(cudaFreeMipmappedArray(hMipmappedArray));
+}
+
+inline static hipError_t hipGetMipmappedArrayLevel(hipArray_t* pLevelArray,
+                                                   hipMipmappedArray_t hMipMappedArray,
+                                                   unsigned int level) {
+    return hipCUDAErrorTohipError(cudaGetMipmappedArrayLevel(pLevelArray, hMipMappedArray, level));
+}
+
 inline static hipError_t hipHostGetDevicePointer(void** devPtr, void* hostPtr, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaHostGetDevicePointer(devPtr, hostPtr, flags));
 }

From d26dff58a0fb680ae4eab89f3ee313d5abb0d36b Mon Sep 17 00:00:00 2001
From: taosang2 <tao.sang@amd.com>
Date: Fri, 23 Jun 2023 14:20:43 -0400
Subject: [PATCH 095/177] SWDEV-368553 - Fix C build failure

Change-Id: I56d760fa6cf8544100e3bcf3d35129bd16d8a42f
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 75ca1c1367..a6331d9fc9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1598,7 +1598,7 @@ inline static hipError_t hipMipmappedArrayGetLevel(hiparray* pLevelArray,
 
 inline static hipError_t hipMallocMipmappedArray(hipMipmappedArray_t* pHandle,
                                                  const hipChannelFormatDesc* desc, hipExtent extent,
-                                                 unsigned int numLevels, unsigned int flags = 0) {
+                                                 unsigned int numLevels, unsigned int flags __dparm(0)) {
     return hipCUDAErrorTohipError(cudaMallocMipmappedArray(pHandle, desc, extent, numLevels, flags));
 }
 

From c15ba5c755e86ee0d1aaf6f74e69ef27c1fb2e1c Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Mon, 24 Jul 2023 19:28:08 +0100
Subject: [PATCH 096/177] SWDEV-264166 - [ABI Break] goodbye gcnArch, hello
 gcnArchName

Change-Id: I4b31284243a801f92f0be9468c25db7d027d9a13
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a6331d9fc9..72e2c5271e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2034,7 +2034,6 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
     p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
     p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
     p_prop->canMapHostMemory = cdprop.canMapHostMemory;
-    p_prop->gcnArch = 0; // Not a GCN arch
     p_prop->integrated = cdprop.integrated;
     p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
     p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;

From 0d3dbe1a1d90c893d95b0efb07894ec2f8ff4c11 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <Maneesh.Gupta@amd.com>
Date: Mon, 31 Jul 2023 08:53:58 -0400
Subject: [PATCH 097/177] Revert "SWDEV-264166 - [ABI Break] goodbye gcnArch,
 hello gcnArchName"

This reverts commit c15ba5c755e86ee0d1aaf6f74e69ef27c1fb2e1c.

Change-Id: Ia7b9ca115be4f9591b195194eed967fc6c330012
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 72e2c5271e..a6331d9fc9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2034,6 +2034,7 @@ inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int dev
     p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
     p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
     p_prop->canMapHostMemory = cdprop.canMapHostMemory;
+    p_prop->gcnArch = 0; // Not a GCN arch
     p_prop->integrated = cdprop.integrated;
     p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
     p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;

From cd8a0b2496f44f775557dc00e499b9ec9cb2e217 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Wed, 2 Aug 2023 13:36:35 +0000
Subject: [PATCH 098/177] Github-3225 - Fix hip on cuda build issue with
 -default-stream=per-thread

Change-Id: I0485891c265617c213964f4130e90fbca9d801d4
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a6331d9fc9..dc746de8e6 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3525,7 +3525,7 @@ inline static hipError_t hipStreamGetCaptureInfo(hipStream_t stream,
     return hipCUDAErrorTohipError(cudaStreamGetCaptureInfo(stream, pCaptureStatus, pId));
 }
 
-#if CUDA_VERSION >= CUDA_11030
+#if CUDA_VERSION >= CUDA_11030 || defined(__CUDA_API_VERSION_INTERNAL)
 inline static hipError_t hipStreamGetCaptureInfo_v2(
     hipStream_t stream, hipStreamCaptureStatus* captureStatus_out,
     unsigned long long* id_out __dparm(0), hipGraph_t* graph_out __dparm(0),

From 2eb5e93d26af0179d11fdebc225a3807ce1eb208 Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Thu, 21 Sep 2023 11:09:15 +0100
Subject: [PATCH 099/177] SWDEV-422808 - Add E5M2 and E4M3 for nvidia headers

Change-Id: Ib40e75c1b2e18d75164607a46e95fc322363c08e
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 54 ++++++++++---------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index dc746de8e6..11c4d03862 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -96,34 +96,36 @@ typedef enum hipMemoryAdvise {
 
 // hipDataType
 #define hipDataType cudaDataType
-#define HIP_R_16F  CUDA_R_16F
-#define HIP_C_16F  CUDA_C_16F
+#define HIP_R_16F CUDA_R_16F
+#define HIP_C_16F CUDA_C_16F
 #define HIP_R_16BF CUDA_R_16BF
 #define HIP_C_16BF CUDA_C_16BF
-#define HIP_R_32F  CUDA_R_32F
-#define HIP_C_32F  CUDA_C_32F
-#define HIP_R_64F  CUDA_R_64F
-#define HIP_C_64F  CUDA_C_64F
-#define HIP_R_4I   CUDA_R_4I
-#define HIP_C_4I   CUDA_C_4I
-#define HIP_R_4U   CUDA_R_4U
-#define HIP_C_4U   CUDA_C_4U
-#define HIP_R_8I   CUDA_R_8I
-#define HIP_C_8I   CUDA_C_8I
-#define HIP_R_8U   CUDA_R_8U
-#define HIP_C_8U   CUDA_C_8U
-#define HIP_R_16I  CUDA_R_16I
-#define HIP_C_16I  CUDA_C_16I
-#define HIP_R_16U  CUDA_R_16U
-#define HIP_C_16U  CUDA_C_16U
-#define HIP_R_32I  CUDA_R_32I
-#define HIP_C_32I  CUDA_C_32I
-#define HIP_R_32U  CUDA_R_32U
-#define HIP_C_32U  CUDA_C_32U
-#define HIP_R_64I  CUDA_R_64I
-#define HIP_C_64I  CUDA_C_64I
-#define HIP_R_64U  CUDA_R_64U
-#define HIP_C_64U  CUDA_C_64U
+#define HIP_R_32F CUDA_R_32F
+#define HIP_C_32F CUDA_C_32F
+#define HIP_R_64F CUDA_R_64F
+#define HIP_C_64F CUDA_C_64F
+#define HIP_R_4I CUDA_R_4I
+#define HIP_C_4I CUDA_C_4I
+#define HIP_R_4U CUDA_R_4U
+#define HIP_C_4U CUDA_C_4U
+#define HIP_R_8I CUDA_R_8I
+#define HIP_C_8I CUDA_C_8I
+#define HIP_R_8U CUDA_R_8U
+#define HIP_C_8U CUDA_C_8U
+#define HIP_R_16I CUDA_R_16I
+#define HIP_C_16I CUDA_C_16I
+#define HIP_R_16U CUDA_R_16U
+#define HIP_C_16U CUDA_C_16U
+#define HIP_R_32I CUDA_R_32I
+#define HIP_C_32I CUDA_C_32I
+#define HIP_R_32U CUDA_R_32U
+#define HIP_C_32U CUDA_C_32U
+#define HIP_R_64I CUDA_R_64I
+#define HIP_C_64I CUDA_C_64I
+#define HIP_R_64U CUDA_R_64U
+#define HIP_C_64U CUDA_C_64U
+#define HIP_R_8F_E4M3 CUDA_R_8F_E4M3
+#define HIP_R_8F_E5M2 CUDA_R_8F_E5M2
 
 // hip stream operation masks
 #define STREAM_OPS_WAIT_MASK_32 0xFFFFFFFF

From cc83fa5c4607b37b0b27aa2d290f1e0c6e988a58 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Thu, 7 Sep 2023 17:08:31 +0100
Subject: [PATCH 100/177] SWDEV-420822 - [ABI Break] Merge
 hipFunction_attribute into hipFuncAttribute

Change-Id: I9b9f7979e9b8dd422d9064c17c79a25bf7248d30
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 11c4d03862..c7508ba716 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -444,7 +444,7 @@ typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
 typedef struct cudaLaunchParams hipLaunchParams;
 typedef CUDA_LAUNCH_PARAMS hipFunctionLaunchParams;
-#define hipFunction_attribute CUfunction_attribute
+#define hipFuncAttribute  CUfunction_attribute
 #define hipPointer_attribute CUpointer_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
 #define HIP_MEMCPY3D CUDA_MEMCPY3D
@@ -2764,7 +2764,7 @@ inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const voi
     return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
 }
 
-inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {
+inline static hipError_t hipFuncGetAttribute (int* value, hipFuncAttribute  attrib, hipFunction_t hfunc) {
     return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
 }
 

From 0a914daa1c6728788e0260e464fbc345f40ffc41 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Tue, 3 Oct 2023 11:22:29 +0100
Subject: [PATCH 101/177] SWDEV-425129 - Revert "SWDEV-420822 - [ABI Break]
 Merge hipFunction_attribute into hipFuncAttribute"

This reverts commit cc83fa5c4607b37b0b27aa2d290f1e0c6e988a58.

Change-Id: I02d6e77f0874fc37f22de267435917edd4fdede3
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index c7508ba716..11c4d03862 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -444,7 +444,7 @@ typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
 typedef struct cudaLaunchParams hipLaunchParams;
 typedef CUDA_LAUNCH_PARAMS hipFunctionLaunchParams;
-#define hipFuncAttribute  CUfunction_attribute
+#define hipFunction_attribute CUfunction_attribute
 #define hipPointer_attribute CUpointer_attribute
 #define hip_Memcpy2D CUDA_MEMCPY2D
 #define HIP_MEMCPY3D CUDA_MEMCPY3D
@@ -2764,7 +2764,7 @@ inline static hipError_t hipFuncGetAttributes(hipFuncAttributes* attr, const voi
     return hipCUDAErrorTohipError(cudaFuncGetAttributes(attr, func));
 }
 
-inline static hipError_t hipFuncGetAttribute (int* value, hipFuncAttribute  attrib, hipFunction_t hfunc) {
+inline static hipError_t hipFuncGetAttribute (int* value, hipFunction_attribute attrib, hipFunction_t hfunc) {
     return hipCUResultTohipError(cuFuncGetAttribute(value, attrib, hfunc));
 }
 

From 2e8acab3819e5c4f18b2f87b3ca4e848b032a628 Mon Sep 17 00:00:00 2001
From: taosang2 <tao.sang@amd.com>
Date: Thu, 10 Aug 2023 11:49:48 -0400
Subject: [PATCH 102/177] SWDEV-299127 - Support External Mipmap

Support hipExternalMemoryGetMappedMipmappedArray().
Add ImageExternalBuffer to differentiate it from ImageBuffer.
Currently we only support tiling_optimal mode, as the
Vulkan driver doesn't provide tiling information.

Change-Id: I7e3524cdde53e4df9f728894bcebf4bd3f58d4d9
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 11c4d03862..c340cb0b01 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1284,6 +1284,7 @@ typedef enum cudaExternalMemoryHandleType hipExternalMemoryHandleType;
 typedef struct cudaExternalMemoryHandleDesc hipExternalMemoryHandleDesc;
 typedef struct cudaExternalMemoryBufferDesc hipExternalMemoryBufferDesc;
 typedef cudaExternalMemory_t hipExternalMemory_t;
+typedef struct cudaExternalMemoryMipmappedArrayDesc hipExternalMemoryMipmappedArrayDesc;  // struct tag needed in C
 
 typedef enum cudaExternalSemaphoreHandleType hipExternalSemaphoreHandleType;
 #define hipExternalSemaphoreHandleTypeOpaqueFd cudaExternalSemaphoreHandleTypeOpaqueFd
@@ -2944,6 +2945,14 @@ inline static hipError_t hipExternalMemoryGetMappedBuffer(void **devPtr, hipExte
   return hipCUDAErrorTohipError(cudaExternalMemoryGetMappedBuffer(devPtr, extMem, (const struct cudaExternalMemoryBufferDesc*)bufferDesc));
 }
 
+inline static hipError_t hipExternalMemoryGetMappedMipmappedArray(
+    hipMipmappedArray_t* mipmap, hipExternalMemory_t extMem,
+    const hipExternalMemoryMipmappedArrayDesc* mipmapDesc) {
+  return hipCUDAErrorTohipError(cudaExternalMemoryGetMappedMipmappedArray(
+      (cudaMipmappedArray_t*)mipmap, (cudaExternalMemory_t)extMem,
+      (const struct cudaExternalMemoryMipmappedArrayDesc*)mipmapDesc));
+}
+
 inline static hipError_t hipDestroyExternalMemory(hipExternalMemory_t extMem) {
   return hipCUDAErrorTohipError(cudaDestroyExternalMemory(extMem));
 }

From 0ca6451c91d8af938e34f09eec315a75d820592f Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Wed, 2 Aug 2023 13:34:19 +0100
Subject: [PATCH 103/177] SWDEV-306642 - [ABI Break] Add texture/surface/device
 capabilities device struct entries

- alias hipGetDeviceProperties to hipGetDevicePropertiesR0600
- alias hipDeviceProp_t to hipDeviceProp_tR0600
- remove gcnArch from new device property struct
- add new requested struct members

Change-Id: If3f5dbef3d608487d9f6f419285f4bf577ea9bf0
---
 .../hip/nvidia_detail/nvidia_hip_runtime.h    |  22 +--
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 175 ++++++++++++------
 2 files changed, 128 insertions(+), 69 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
index c63e35700b..eabce14fa7 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime.h
@@ -32,11 +32,11 @@ THE SOFTWARE.
 typedef int hipLaunchParm;
 
 #define hipLaunchKernelGGLInternal(kernelName, numBlocks, numThreads, memPerBlock, streamId, ...)  \
-    do {                                                                                           \
-        kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                 \
-    } while (0)
+  do {                                                                                             \
+    kernelName<<<numBlocks, numThreads, memPerBlock, streamId>>>(__VA_ARGS__);                     \
+  } while (0)
 
-#define hipLaunchKernelGGL(kernelName, ...)  hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
+#define hipLaunchKernelGGL(kernelName, ...) hipLaunchKernelGGLInternal((kernelName), __VA_ARGS__)
 
 #define hipReadModeElementType cudaReadModeElementType
 
@@ -105,15 +105,15 @@ typedef int hipLaunchParm;
 #define HIP_DYNAMIC_SHARED_ATTRIBUTE
 
 #ifdef __HIP_DEVICE_COMPILE__
-#define abort_()                                                                                    \
-    { asm("trap;"); }
+#define abort_()                                                                                   \
+  { asm("trap;"); }
 #undef assert
 #define assert(COND)                                                                               \
-    {                                                                                              \
-        if (!COND) {                                                                               \
-            abort_();                                                                               \
-        }                                                                                          \
-    }
+  {                                                                                                \
+    if (!(COND)) { /* parenthesized: negates the whole expression */                               \
+      abort_();                                                                                    \
+    }                                                                                              \
+  }
 #endif
 
 #define __clock() clock()
diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index c340cb0b01..89ae35428c 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1980,87 +1980,143 @@ inline static hipError_t hipMemset3DAsync(hipPitchedPtr pitchedDevPtr, int  valu
 }
 
 inline static hipError_t hipGetDeviceProperties(hipDeviceProp_t* p_prop, int device) {
-
     if (p_prop == NULL) {
-      return hipErrorInvalidValue;
+       return hipErrorInvalidValue;
     }
 
     struct cudaDeviceProp cdprop;
-    cudaError_t cerror;
-    cerror = cudaGetDeviceProperties(&cdprop, device);
+    hipError_t error = hipCUDAErrorTohipError(cudaGetDeviceProperties(&cdprop, device));
+
+    if (error != hipSuccess) {
+       return error;
+    }
 
     strncpy(p_prop->name, cdprop.name, 256);
+    memcpy(p_prop->uuid.bytes, cdprop.uuid.bytes, 16);  // raw bytes: strncpy would stop at 0x00
+    memcpy(p_prop->luid, cdprop.luid, 8);               // raw bytes: strncpy would stop at 0x00
+    p_prop->luidDeviceNodeMask = cdprop.luidDeviceNodeMask;
     p_prop->totalGlobalMem = cdprop.totalGlobalMem;
     p_prop->sharedMemPerBlock = cdprop.sharedMemPerBlock;
     p_prop->regsPerBlock = cdprop.regsPerBlock;
-    p_prop->warpSize = cdprop.warpSize;
+    p_prop->memPitch = cdprop.memPitch;
     p_prop->maxThreadsPerBlock = cdprop.maxThreadsPerBlock;
-    for (int i = 0; i < 3; i++) {
-        p_prop->maxThreadsDim[i] = cdprop.maxThreadsDim[i];
-        p_prop->maxGridSize[i] = cdprop.maxGridSize[i];
-    }
+    p_prop->maxThreadsDim[0] = cdprop.maxThreadsDim[0];
+    p_prop->maxThreadsDim[1] = cdprop.maxThreadsDim[1];
+    p_prop->maxThreadsDim[2] = cdprop.maxThreadsDim[2];
+    p_prop->maxGridSize[0] = cdprop.maxGridSize[0];
+    p_prop->maxGridSize[1] = cdprop.maxGridSize[1];
+    p_prop->maxGridSize[2] = cdprop.maxGridSize[2];
     p_prop->clockRate = cdprop.clockRate;
-    p_prop->memoryClockRate = cdprop.memoryClockRate;
-    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
     p_prop->totalConstMem = cdprop.totalConstMem;
     p_prop->major = cdprop.major;
     p_prop->minor = cdprop.minor;
+    p_prop->textureAlignment = cdprop.textureAlignment;
+    p_prop->texturePitchAlignment = cdprop.texturePitchAlignment;
+    p_prop->deviceOverlap = cdprop.deviceOverlap;
     p_prop->multiProcessorCount = cdprop.multiProcessorCount;
-    p_prop->l2CacheSize = cdprop.l2CacheSize;
-    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
-    p_prop->computeMode = cdprop.computeMode;
-    p_prop->clockInstructionRate = cdprop.clockRate; // Same as clock-rate:
-
-    int ccVers = p_prop->major * 100 + p_prop->minor * 10;
-    p_prop->arch.hasGlobalInt32Atomics = (ccVers >= 110);
-    p_prop->arch.hasGlobalFloatAtomicExch = (ccVers >= 110);
-    p_prop->arch.hasSharedInt32Atomics = (ccVers >= 120);
-    p_prop->arch.hasSharedFloatAtomicExch = (ccVers >= 120);
-    p_prop->arch.hasFloatAtomicAdd = (ccVers >= 200);
-    p_prop->arch.hasGlobalInt64Atomics = (ccVers >= 120);
-    p_prop->arch.hasSharedInt64Atomics = (ccVers >= 110);
-    p_prop->arch.hasDoubles = (ccVers >= 130);
-    p_prop->arch.hasWarpVote = (ccVers >= 120);
-    p_prop->arch.hasWarpBallot = (ccVers >= 200);
-    p_prop->arch.hasWarpShuffle = (ccVers >= 300);
-    p_prop->arch.hasFunnelShift = (ccVers >= 350);
-    p_prop->arch.hasThreadFenceSystem = (ccVers >= 200);
-    p_prop->arch.hasSyncThreadsExt = (ccVers >= 200);
-    p_prop->arch.hasSurfaceFuncs = (ccVers >= 200);
-    p_prop->arch.has3dGrid = (ccVers >= 200);
-    p_prop->arch.hasDynamicParallelism = (ccVers >= 350);
-
-    p_prop->concurrentKernels = cdprop.concurrentKernels;
-    p_prop->pciDomainID = cdprop.pciDomainID;
-    p_prop->pciBusID = cdprop.pciBusID;
-    p_prop->pciDeviceID = cdprop.pciDeviceID;
-    p_prop->maxSharedMemoryPerMultiProcessor = cdprop.sharedMemPerMultiprocessor;
-    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
-    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
-    p_prop->gcnArch = 0; // Not a GCN arch
+    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
     p_prop->integrated = cdprop.integrated;
-    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
-    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
-    p_prop->cooperativeMultiDeviceUnmatchedFunc = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedGridDim = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedBlockDim = 0;
-    p_prop->cooperativeMultiDeviceUnmatchedSharedMem = 0;
-
-    p_prop->maxTexture1D    = cdprop.maxTexture1D;
+    p_prop->canMapHostMemory = cdprop.canMapHostMemory;
+    p_prop->computeMode = cdprop.computeMode;
+    p_prop->maxTexture1D = cdprop.maxTexture1D;
+    p_prop->maxTexture1DMipmap = cdprop.maxTexture1DMipmap;
+    p_prop->maxTexture1DLinear = cdprop.maxTexture1DLinear;
     p_prop->maxTexture2D[0] = cdprop.maxTexture2D[0];
     p_prop->maxTexture2D[1] = cdprop.maxTexture2D[1];
+    p_prop->maxTexture2DMipmap[0] = cdprop.maxTexture2DMipmap[0];
+    p_prop->maxTexture2DMipmap[1] = cdprop.maxTexture2DMipmap[1];
+    p_prop->maxTexture2DLinear[0] = cdprop.maxTexture2DLinear[0];
+    p_prop->maxTexture2DLinear[1] = cdprop.maxTexture2DLinear[1];
+    p_prop->maxTexture2DLinear[2] = cdprop.maxTexture2DLinear[2];
+    p_prop->maxTexture2DGather[0] = cdprop.maxTexture2DGather[0];
+    p_prop->maxTexture2DGather[1] = cdprop.maxTexture2DGather[1];
     p_prop->maxTexture3D[0] = cdprop.maxTexture3D[0];
     p_prop->maxTexture3D[1] = cdprop.maxTexture3D[1];
     p_prop->maxTexture3D[2] = cdprop.maxTexture3D[2];
+    p_prop->maxTexture3DAlt[0] = cdprop.maxTexture3DAlt[0];
+    p_prop->maxTexture3DAlt[1] = cdprop.maxTexture3DAlt[1];
+    p_prop->maxTexture3DAlt[2] = cdprop.maxTexture3DAlt[2];
+    p_prop->maxTextureCubemap = cdprop.maxTextureCubemap;
+    p_prop->maxTexture1DLayered[0] = cdprop.maxTexture1DLayered[0];
+    p_prop->maxTexture1DLayered[1] = cdprop.maxTexture1DLayered[1];
+    p_prop->maxTexture2DLayered[0] = cdprop.maxTexture2DLayered[0];
+    p_prop->maxTexture2DLayered[1] = cdprop.maxTexture2DLayered[1];
+    p_prop->maxTexture2DLayered[2] = cdprop.maxTexture2DLayered[2];
+    p_prop->maxTextureCubemapLayered[0] = cdprop.maxTextureCubemapLayered[0];
+    p_prop->maxTextureCubemapLayered[1] = cdprop.maxTextureCubemapLayered[1];
+    p_prop->maxSurface1D = cdprop.maxSurface1D;
+    p_prop->maxSurface2D[0] = cdprop.maxSurface2D[0];
+    p_prop->maxSurface2D[1] = cdprop.maxSurface2D[1];
+    p_prop->maxSurface3D[0] = cdprop.maxSurface3D[0];
+    p_prop->maxSurface3D[1] = cdprop.maxSurface3D[1];
+    p_prop->maxSurface3D[2] = cdprop.maxSurface3D[2];
+    p_prop->maxSurface1DLayered[0] = cdprop.maxSurface1DLayered[0];
+    p_prop->maxSurface1DLayered[1] = cdprop.maxSurface1DLayered[1];
+    p_prop->maxSurface2DLayered[0] = cdprop.maxSurface2DLayered[0];
+    p_prop->maxSurface2DLayered[1] = cdprop.maxSurface2DLayered[1];
+    p_prop->maxSurface2DLayered[2] = cdprop.maxSurface2DLayered[2];
+    p_prop->maxSurfaceCubemap = cdprop.maxSurfaceCubemap;
+    p_prop->maxSurfaceCubemapLayered[0] = cdprop.maxSurfaceCubemapLayered[0];
+    p_prop->maxSurfaceCubemapLayered[1] = cdprop.maxSurfaceCubemapLayered[1];
+    p_prop->surfaceAlignment = cdprop.surfaceAlignment;
+    p_prop->concurrentKernels = cdprop.concurrentKernels;
+    p_prop->ECCEnabled = cdprop.ECCEnabled;
+    p_prop->pciBusID = cdprop.pciBusID;
+    p_prop->pciDeviceID = cdprop.pciDeviceID;
+    p_prop->pciDomainID = cdprop.pciDomainID;
+    p_prop->tccDriver = cdprop.tccDriver;
+    p_prop->asyncEngineCount = cdprop.asyncEngineCount;
+    p_prop->unifiedAddressing = cdprop.unifiedAddressing;
+    p_prop->memoryClockRate = cdprop.memoryClockRate;
+    p_prop->memoryBusWidth = cdprop.memoryBusWidth;
+    p_prop->l2CacheSize = cdprop.l2CacheSize;
+    p_prop->maxThreadsPerMultiProcessor = cdprop.maxThreadsPerMultiProcessor;
+    p_prop->streamPrioritiesSupported = cdprop.streamPrioritiesSupported;
+    p_prop->globalL1CacheSupported = cdprop.globalL1CacheSupported;
+    p_prop->localL1CacheSupported = cdprop.localL1CacheSupported;
+    p_prop->sharedMemPerMultiprocessor = cdprop.sharedMemPerMultiprocessor;
+    p_prop->regsPerMultiprocessor = cdprop.regsPerMultiprocessor;
+    p_prop->managedMemory = cdprop.managedMemory;
+    p_prop->isMultiGpuBoard = cdprop.isMultiGpuBoard;
+    p_prop->multiGpuBoardGroupID = cdprop.multiGpuBoardGroupID;
+    p_prop->hostNativeAtomicSupported = cdprop.hostNativeAtomicSupported;
+    p_prop->singleToDoublePrecisionPerfRatio = cdprop.singleToDoublePrecisionPerfRatio;
+    p_prop->pageableMemoryAccess = cdprop.pageableMemoryAccess;
+    p_prop->concurrentManagedAccess = cdprop.concurrentManagedAccess;
+    p_prop->computePreemptionSupported = cdprop.computePreemptionSupported;
+    p_prop->canUseHostPointerForRegisteredMem = cdprop.canUseHostPointerForRegisteredMem;
+    p_prop->cooperativeLaunch = cdprop.cooperativeLaunch;
+    p_prop->cooperativeMultiDeviceLaunch = cdprop.cooperativeMultiDeviceLaunch;
+    p_prop->sharedMemPerBlockOptin = cdprop.sharedMemPerBlockOptin;
+    p_prop->pageableMemoryAccessUsesHostPageTables = cdprop.pageableMemoryAccessUsesHostPageTables;
+    p_prop->directManagedMemAccessFromHost = cdprop.directManagedMemAccessFromHost;
 
-    p_prop->memPitch                 = cdprop.memPitch;
-    p_prop->textureAlignment         = cdprop.textureAlignment;
-    p_prop->texturePitchAlignment    = cdprop.texturePitchAlignment;
-    p_prop->kernelExecTimeoutEnabled = cdprop.kernelExecTimeoutEnabled;
-    p_prop->ECCEnabled               = cdprop.ECCEnabled;
-    p_prop->tccDriver                = cdprop.tccDriver;
 
-    return hipCUDAErrorTohipError(cerror);
+    p_prop->warpSize = cdprop.warpSize;  // not version-gated: was copied unconditionally before
+#if CUDA_VERSION >= 11010
+    p_prop->accessPolicyMaxWindowSize = cdprop.accessPolicyMaxWindowSize;
+    p_prop->maxBlocksPerMultiProcessor = cdprop.maxBlocksPerMultiProcessor;
+    p_prop->persistingL2CacheMaxSize = cdprop.persistingL2CacheMaxSize;
+    p_prop->reservedSharedMemPerBlock = cdprop.reservedSharedMemPerBlock;
+#endif
+
+#if CUDA_VERSION >= 12000
+    p_prop->clusterLaunch = cdprop.clusterLaunch;
+    p_prop->deferredMappingHipArraySupported = cdprop.deferredMappingCudaArraySupported;
+    p_prop->gpuDirectRDMAFlushWritesOptions = cdprop.gpuDirectRDMAFlushWritesOptions;
+    p_prop->gpuDirectRDMASupported = cdprop.gpuDirectRDMASupported;
+    p_prop->gpuDirectRDMAWritesOrdering = cdprop.gpuDirectRDMAWritesOrdering;
+    p_prop->hostRegisterReadOnlySupported = cdprop.hostRegisterReadOnlySupported;
+    p_prop->hostRegisterSupported = cdprop.hostRegisterSupported;
+    p_prop->ipcEventSupported = cdprop.ipcEventSupported;
+    p_prop->memoryPoolSupportedHandleTypes = cdprop.memoryPoolSupportedHandleTypes;
+    p_prop->memoryPoolsSupported = cdprop.memoryPoolsSupported;
+    p_prop->sparseHipArraySupported = cdprop.sparseCudaArraySupported;
+    p_prop->timelineSemaphoreInteropSupported = cdprop.timelineSemaphoreInteropSupported;
+    p_prop->unifiedFunctionPointers = cdprop.unifiedFunctionPointers;
+#endif
+
+    return error;
 }
 
 inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device) {
@@ -2188,6 +2244,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeCooperativeMultiDeviceLaunch:
             cdattr = cudaDevAttrCooperativeMultiDeviceLaunch;
             break;
+        case hipDeviceAttributeHostRegisterSupported:
+            cdattr = cudaDevAttrHostRegisterSupported;
+            break;
         case hipDeviceAttributeConcurrentManagedAccess:
             cdattr = cudaDevAttrConcurrentManagedAccess;
             break;

From 452c1060f3efffa77b902cca4710e7418fc003c6 Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Tue, 25 Jul 2023 14:29:18 -0400
Subject: [PATCH 104/177] SWDEV-332969 - [ABI Break]Substitute hipArray* with
 hipArray_t

- hipArray will be an internal struct from rocm6.0

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: Icf97fe96b87be8532098cd7f9ceaad099f99c9b9
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 292 +++++++++++++++---
 1 file changed, 257 insertions(+), 35 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 89ae35428c..ac387be5d5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -254,7 +254,6 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
 #define hipStreamPerThread ((cudaStream_t)2)
 
 #define hipTexRef CUtexref
-#define hiparray CUarray
 typedef CUmipmappedArray hipmipmappedArray;
 typedef cudaMipmappedArray_t hipMipmappedArray_t;
 
@@ -438,7 +437,6 @@ typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
 typedef CUmodule hipModule_t;
 typedef CUfunction hipFunction_t;
 typedef CUdeviceptr hipDeviceptr_t;
-typedef struct cudaArray hipArray;
 typedef struct cudaArray* hipArray_t;
 typedef struct cudaArray* hipArray_const_t;
 typedef struct cudaFuncAttributes hipFuncAttributes;
@@ -446,8 +444,209 @@ typedef struct cudaLaunchParams hipLaunchParams;
 typedef CUDA_LAUNCH_PARAMS hipFunctionLaunchParams;
 #define hipFunction_attribute CUfunction_attribute
 #define hipPointer_attribute CUpointer_attribute
-#define hip_Memcpy2D CUDA_MEMCPY2D
-#define HIP_MEMCPY3D CUDA_MEMCPY3D
+
+typedef struct HIP_RESOURCE_DESC_st
+{
+    hipResourcetype  resType;                     /**< Resource type */
+    union {
+        struct {
+            hipArray_t hArray;                   /**< HIP array */
+        } array;
+        struct {
+            hipMipmappedArray_t hMipmappedArray; /**< HIP mipmapped array */
+        } mipmap;
+        struct {
+            hipDeviceptr_t devPtr;               /**< Device pointer */
+            hipArray_Format format;              /**< Array format */
+            unsigned int numChannels;            /**< Channels per array element */
+            size_t sizeInBytes;                  /**< Size in bytes */
+        } linear;
+        struct {
+            hipDeviceptr_t devPtr;               /**< Device pointer */
+            hipArray_Format format;              /**< Array format */
+            unsigned int numChannels;            /**< Channels per array element */
+            size_t width;                        /**< Width of the array in elements */
+            size_t height;                       /**< Height of the array in elements */
+            size_t pitchInBytes;                 /**< Pitch between two rows in bytes */
+        } pitch2D;
+        struct {
+            int reserved[32];
+        } reserved;
+    } res;
+    unsigned int flags;                          /**< Flags (must be zero) */
+} HIP_RESOURCE_DESC;
+
+// Builds the CUDA driver descriptor for *p, copying only the union member that resType selects.
+// The result lives in thread-local static storage (the address of a stack local used to be
+// returned) and is overwritten by this thread's next call. Unknown types are treated as ARRAY.
+static inline CUDA_RESOURCE_DESC* hipResourceDesTocudaResourceDes(const HIP_RESOURCE_DESC* p){
+    static thread_local CUDA_RESOURCE_DESC a;
+    switch (p->resType) {  // every case now ends in break; they used to fall through to default
+        case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY: a.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+            a.res.mipmap.hMipmappedArray = (CUmipmappedArray)p->res.mipmap.hMipmappedArray;
+            break;
+        case HIP_RESOURCE_TYPE_LINEAR: a.resType = CU_RESOURCE_TYPE_LINEAR;
+            a.res.linear.devPtr = p->res.linear.devPtr;
+            a.res.linear.format = p->res.linear.format;
+            a.res.linear.numChannels = p->res.linear.numChannels;
+            a.res.linear.sizeInBytes = p->res.linear.sizeInBytes;
+            break;
+        case HIP_RESOURCE_TYPE_PITCH2D: a.resType = CU_RESOURCE_TYPE_PITCH2D;
+            a.res.pitch2D.devPtr = p->res.pitch2D.devPtr;
+            a.res.pitch2D.format = p->res.pitch2D.format;
+            a.res.pitch2D.numChannels = p->res.pitch2D.numChannels;
+            a.res.pitch2D.width = p->res.pitch2D.width;
+            a.res.pitch2D.height = p->res.pitch2D.height;
+            a.res.pitch2D.pitchInBytes = p->res.pitch2D.pitchInBytes;
+            break;
+        default: a.resType = CU_RESOURCE_TYPE_ARRAY;  // HIP_RESOURCE_TYPE_ARRAY and anything else
+            a.res.array.hArray = (CUarray)p->res.array.hArray;
+    }
+    a.flags = p->flags;
+    return &a;
+}
+
+typedef struct hip_Memcpy2D {
+    size_t srcXInBytes;
+    size_t srcY;
+    hipMemoryType srcMemoryType;
+    const void* srcHost;
+    hipDeviceptr_t srcDevice;
+    hipArray_t srcArray;
+    size_t srcPitch;
+    size_t dstXInBytes;
+    size_t dstY;
+    hipMemoryType dstMemoryType;
+    void* dstHost;
+    hipDeviceptr_t dstDevice;
+    hipArray_t dstArray;
+    size_t dstPitch;
+    size_t WidthInBytes;
+    size_t Height;
+} hip_Memcpy2D;
+
+typedef struct HIP_MEMCPY3D {
+  unsigned int srcXInBytes;
+  unsigned int srcY;
+  unsigned int srcZ;
+  unsigned int srcLOD;
+  hipMemoryType srcMemoryType;
+  const void* srcHost;
+  hipDeviceptr_t srcDevice;
+  hipArray_t srcArray;
+  unsigned int srcPitch;
+  unsigned int srcHeight;
+  unsigned int dstXInBytes;
+  unsigned int dstY;
+  unsigned int dstZ;
+  unsigned int dstLOD;
+  hipMemoryType dstMemoryType;
+  void* dstHost;
+  hipDeviceptr_t dstDevice;
+  hipArray_t dstArray;
+  unsigned int dstPitch;
+  unsigned int dstHeight;
+  unsigned int WidthInBytes;
+  unsigned int Height;
+  unsigned int Depth;
+} HIP_MEMCPY3D;
+
+static inline void hipMemcpy3DTocudaMemcpy3D(CUDA_MEMCPY3D &a, const HIP_MEMCPY3D* p){
+    a.srcXInBytes = (size_t)p->srcXInBytes;
+    a.srcY = (size_t)p->srcY;
+    a.srcZ = (size_t)p->srcZ;
+    a.srcLOD = (size_t)p->srcLOD;
+    switch (p->srcMemoryType) {
+        case hipMemoryTypeHost:
+            a.srcMemoryType = CU_MEMORYTYPE_HOST;
+            break;
+        case hipMemoryTypeDevice:
+            a.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+            break;
+        case hipMemoryTypeArray:
+            a.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            break;
+        default:
+            a.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+    }
+    a.srcHost = p->srcHost;
+    a.srcDevice =(CUdeviceptr)p->srcDevice;
+    a.srcArray = (CUarray)p->srcArray;
+    a.reserved0 = nullptr;
+    a.srcPitch = (size_t)p->srcPitch;
+    a.srcHeight = (size_t)p->srcHeight;
+    a.dstXInBytes = (size_t)p->dstXInBytes;
+    a.dstY = (size_t)p->dstY;
+    a.dstZ = (size_t)p->dstZ;
+    a.dstLOD = (size_t)p->dstLOD;
+    switch (p->dstMemoryType) {
+        case hipMemoryTypeHost:
+            a.dstMemoryType = CU_MEMORYTYPE_HOST;
+            break;
+        case hipMemoryTypeDevice:
+            a.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+            break;
+        case hipMemoryTypeArray:
+            a.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            break;
+        default:
+            a.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+    }
+    a.dstHost = p->dstHost;
+    a.dstDevice = (CUdeviceptr)p->dstDevice;
+    a.dstArray = (CUarray)p->dstArray;
+    a.reserved1 = nullptr;
+    a.dstPitch = (size_t)p->dstPitch;
+    a.dstHeight = (size_t)p->dstHeight;
+    a.WidthInBytes = (size_t)p->WidthInBytes;
+    a.Height = (size_t)p->Height;
+    a.Depth = (size_t)p->Depth;
+}
+
+static inline void hipMemcpy2DTocudaMemcpy2D(CUDA_MEMCPY2D &a, const hip_Memcpy2D* p){
+    a.srcXInBytes = (size_t)p->srcXInBytes;
+    a.srcY = (size_t)p->srcY;
+    switch (p->srcMemoryType) {
+        case hipMemoryTypeHost:
+            a.srcMemoryType = CU_MEMORYTYPE_HOST;
+            break;
+        case hipMemoryTypeDevice:
+            a.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+            break;
+        case hipMemoryTypeArray:
+            a.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            break;
+        default:
+            a.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+    }
+    a.srcHost = p->srcHost;
+    a.srcDevice = (CUdeviceptr)p->srcDevice;
+    a.srcArray = (CUarray)p->srcArray;
+    a.srcPitch = (size_t)p->srcPitch;
+    a.dstXInBytes = (size_t)p->dstXInBytes;
+    a.dstY = (size_t)p->dstY;
+    switch (p->dstMemoryType) {
+        case hipMemoryTypeHost:
+            a.dstMemoryType = CU_MEMORYTYPE_HOST;
+            break;
+        case hipMemoryTypeDevice:
+            a.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+            break;
+        case hipMemoryTypeArray:
+            a.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            break;
+        default:
+            a.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+    }
+    a.dstHost = p->dstHost;
+    a.dstDevice = (CUdeviceptr)p->dstDevice;
+    a.dstArray = (CUarray)p->dstArray;
+    a.dstPitch = (size_t)p->dstPitch;
+    a.WidthInBytes = (size_t)p->WidthInBytes;
+    a.Height = (size_t)p->Height;
+}
+
+
 #define hipMemcpy3DParms cudaMemcpy3DParms
 #define hipArrayDefault cudaArrayDefault
 #define hipArrayLayered cudaArrayLayered
@@ -507,7 +706,6 @@ typedef struct cudaChannelFormatDesc hipChannelFormatDesc;
 typedef struct cudaResourceDesc hipResourceDesc;
 typedef struct cudaTextureDesc hipTextureDesc;
 typedef struct cudaResourceViewDesc hipResourceViewDesc;
-typedef CUDA_RESOURCE_DESC HIP_RESOURCE_DESC;
 typedef CUDA_TEXTURE_DESC HIP_TEXTURE_DESC;
 typedef CUDA_RESOURCE_VIEW_DESC HIP_RESOURCE_VIEW_DESC;
 // adding code for hipmemSharedConfig
@@ -1568,18 +1766,18 @@ inline static hipError_t hipMallocManaged(void** ptr, size_t size, unsigned int
     return hipCUDAErrorTohipError(cudaMallocManaged(ptr, size, flags));
 }
 
-inline static hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
+inline static hipError_t hipMallocArray(hipArray_t* array, const hipChannelFormatDesc* desc,
                                         size_t width, size_t height __dparm(0),
                                         unsigned int flags __dparm(hipArrayDefault)) {
     return hipCUDAErrorTohipError(cudaMallocArray(array, desc, width, height, flags));
 }
 
-inline static hipError_t hipMalloc3DArray(hipArray** array, const hipChannelFormatDesc* desc,
+inline static hipError_t hipMalloc3DArray(hipArray_t* array, const hipChannelFormatDesc* desc,
                              hipExtent extent, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaMalloc3DArray(array, desc, extent, flags));
 }
 
-inline static hipError_t hipFreeArray(hipArray* array) {
+inline static hipError_t hipFreeArray(hipArray_t array) {
     return hipCUDAErrorTohipError(cudaFreeArray(array));
 }
 
@@ -1593,7 +1791,7 @@ inline static hipError_t hipMipmappedArrayDestroy(hipmipmappedArray hMipmappedAr
     return hipCUResultTohipError(cuMipmappedArrayDestroy(hMipmappedArray));
 }
 
-inline static hipError_t hipMipmappedArrayGetLevel(hiparray* pLevelArray,
+inline static hipError_t hipMipmappedArrayGetLevel(hipArray_t* pLevelArray,
                                                    hipmipmappedArray hMipMappedArray,
                                                    unsigned int level) {
     return hipCUResultTohipError(cuMipmappedArrayGetLevel((CUarray*)pLevelArray, hMipMappedArray, level));
@@ -1763,11 +1961,23 @@ inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src,
 }
 
 inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
-  return hipCUResultTohipError(cuMemcpy2D(pCopy));
+  if(pCopy == nullptr) {
+    return hipCUResultTohipError(cuMemcpy2D(nullptr));
+  } else {
+    CUDA_MEMCPY2D cudaCopy = {0};
+    hipMemcpy2DTocudaMemcpy2D(cudaCopy, pCopy);
+    return hipCUResultTohipError(cuMemcpy2D((const CUDA_MEMCPY2D*)&cudaCopy));
+  }
 }
 
 inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {
-  return hipCUResultTohipError(cuMemcpy2DAsync(pCopy, stream));
+  if(pCopy == nullptr) {
+    return hipCUResultTohipError(cuMemcpy2DAsync(nullptr, stream));
+  } else {
+    CUDA_MEMCPY2D cudaCopy = {0};
+    hipMemcpy2DTocudaMemcpy2D(cudaCopy, pCopy);
+    return hipCUResultTohipError(cuMemcpy2DAsync((const CUDA_MEMCPY2D*)&cudaCopy, stream));
+  }
 }
 
 inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) {
@@ -1778,12 +1988,24 @@ inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipS
     return hipCUDAErrorTohipError(cudaMemcpy3DAsync(p, stream));
 }
 
-inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pCopy) {
-    return hipCUResultTohipError(cuMemcpy3D(pCopy));
+inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pcopy) {
+    if(pcopy == nullptr) {
+      return hipCUResultTohipError(cuMemcpy3D(nullptr));
+    } else {
+      CUDA_MEMCPY3D cudaCopy = {0};
+      hipMemcpy3DTocudaMemcpy3D(cudaCopy, pcopy);
+      return hipCUResultTohipError(cuMemcpy3D((const CUDA_MEMCPY3D*)&cudaCopy));
+    }
 }
 
-inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D* pCopy, hipStream_t stream) {
-    return hipCUResultTohipError(cuMemcpy3DAsync(pCopy, stream));
+inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D *pcopy, hipStream_t stream) {
+    if(pcopy == nullptr) {
+      return hipCUResultTohipError(cuMemcpy3DAsync(nullptr, stream));
+    } else {
+      CUDA_MEMCPY3D cudaCopy = {0};
+      hipMemcpy3DTocudaMemcpy3D(cudaCopy, pcopy);
+      return hipCUResultTohipError(cuMemcpy3DAsync((const CUDA_MEMCPY3D*)&cudaCopy, stream));
+    }
 }
 
 inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void* src, size_t spitch,
@@ -1793,7 +2015,7 @@ inline static hipError_t hipMemcpy2DAsync(void* dst, size_t dpitch, const void*
                                                     kind, stream));
 }
 
-inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray* src,
+inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray_t src,
                                               size_t wOffset, size_t hOffset, size_t width,
                                               size_t height, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(cudaMemcpy2DFromArray(dst, dpitch, src, wOffset, hOffset, width,
@@ -1801,7 +2023,7 @@ inline static hipError_t hipMemcpy2DFromArray(void* dst, size_t dpitch, hipArray
                                                         kind));
 }
 
-inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray* src,
+inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hipArray_t src,
                                                    size_t wOffset, size_t hOffset, size_t width,
                                                    size_t height, hipMemcpyKind kind,
                                                    hipStream_t stream) {
@@ -1811,14 +2033,14 @@ inline static hipError_t hipMemcpy2DFromArrayAsync(void* dst, size_t dpitch, hip
                                                              stream));
 }
 
-inline static hipError_t hipMemcpy2DToArray(hipArray* dst, size_t wOffset, size_t hOffset,
+inline static hipError_t hipMemcpy2DToArray(hipArray_t dst, size_t wOffset, size_t hOffset,
                                             const void* src, size_t spitch, size_t width,
                                             size_t height, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(cudaMemcpy2DToArray(dst, wOffset, hOffset, src, spitch, width,
                                                       height, kind));
 }
 
-inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset, size_t hOffset,
+inline static hipError_t hipMemcpy2DToArrayAsync(hipArray_t dst, size_t wOffset, size_t hOffset,
                                                  const void* src, size_t spitch, size_t width,
                                                  size_t height, hipMemcpyKind kind,
                                                  hipStream_t stream) {
@@ -1828,7 +2050,7 @@ inline static hipError_t hipMemcpy2DToArrayAsync(hipArray* dst, size_t wOffset,
                                                            stream));
 }
 
-__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray* dst, size_t wOffset,
+__HIP_DEPRECATED inline static hipError_t hipMemcpyToArray(hipArray_t dst, size_t wOffset,
                                                            size_t hOffset, const void* src,
                                                            size_t count, hipMemcpyKind kind) {
     return hipCUDAErrorTohipError(
@@ -1842,12 +2064,12 @@ __HIP_DEPRECATED inline static hipError_t hipMemcpyFromArray(void* dst, hipArray
                                                       kind));
 }
 
-inline static hipError_t hipMemcpyAtoH(void* dst, hipArray* srcArray, size_t srcOffset,
+inline static hipError_t hipMemcpyAtoH(void* dst, hipArray_t srcArray, size_t srcOffset,
                                        size_t count) {
     return hipCUResultTohipError(cuMemcpyAtoH(dst, (CUarray)srcArray, srcOffset, count));
 }
 
-inline static hipError_t hipMemcpyHtoA(hipArray* dstArray, size_t dstOffset, const void* srcHost,
+inline static hipError_t hipMemcpyHtoA(hipArray_t dstArray, size_t dstOffset, const void* srcHost,
                                        size_t count) {
     return hipCUResultTohipError(cuMemcpyHtoA((CUarray)dstArray, dstOffset, srcHost, count));
 }
@@ -3241,7 +3463,7 @@ inline static hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject,
                                             const HIP_RESOURCE_DESC* pResDesc,
                                             const HIP_TEXTURE_DESC* pTexDesc,
                                             const HIP_RESOURCE_VIEW_DESC* pResViewDesc) {
-    return hipCUResultTohipError(cuTexObjectCreate((CUtexObject*)pTexObject, pResDesc, pTexDesc, pResViewDesc));
+    return hipCUResultTohipError(cuTexObjectCreate((CUtexObject*)pTexObject,(CUDA_RESOURCE_DESC*)pResDesc, pTexDesc, pResViewDesc));
 }
 
 inline static hipError_t hipTexObjectDestroy(hipTextureObject_t texObject) {
@@ -3249,7 +3471,7 @@ inline static hipError_t hipTexObjectDestroy(hipTextureObject_t texObject) {
 }
 
 inline static hipError_t hipTexObjectGetResourceDesc(HIP_RESOURCE_DESC* pResDesc, hipTextureObject_t texObject) {
-    return hipCUResultTohipError(cuTexObjectGetResourceDesc(pResDesc, (CUtexObject)texObject));
+    return hipCUResultTohipError(cuTexObjectGetResourceDesc((CUDA_RESOURCE_DESC*)pResDesc, (CUtexObject)texObject));
 }
 
 inline static hipError_t hipTexObjectGetResourceViewDesc(HIP_RESOURCE_VIEW_DESC* pResViewDesc, hipTextureObject_t texObject) {
@@ -3284,35 +3506,35 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, u
     return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
 }
 
-__HIP_DEPRECATED inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hiparray hArray, unsigned int Flags){
-    return hipCUResultTohipError(cuTexRefSetArray(hTexRef,hArray,Flags));
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, hipArray_t hArray, unsigned int Flags){
+    return hipCUResultTohipError(cuTexRefSetArray(hTexRef,(CUarray)hArray,Flags));
 }
 
-inline static hipError_t hipArrayCreate(hiparray* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
-    return hipCUResultTohipError(cuArrayCreate(pHandle, pAllocateArray));
+inline static hipError_t hipArrayCreate(hipArray_t* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
+    return hipCUResultTohipError(cuArrayCreate((CUarray*)pHandle, pAllocateArray));
 }
 
-inline static hipError_t hipArrayDestroy(hiparray hArray){
-    return hipCUResultTohipError(cuArrayDestroy(hArray));
+inline static hipError_t hipArrayDestroy(hipArray_t hArray){
+    return hipCUResultTohipError(cuArrayDestroy((CUarray)hArray));
 }
 
-inline static hipError_t hipArray3DCreate(hiparray* pHandle,
+inline static hipError_t hipArray3DCreate(hipArray_t* pHandle,
                                           const HIP_ARRAY3D_DESCRIPTOR* pAllocateArray){
-    return hipCUResultTohipError(cuArray3DCreate(pHandle, pAllocateArray));
+    return hipCUResultTohipError(cuArray3DCreate((CUarray*)pHandle, pAllocateArray));
 }
 
 inline static hipError_t hipArrayGetInfo(hipChannelFormatDesc* desc, hipExtent* extent,
-                                          unsigned int* flags, hipArray* array) {
+                                          unsigned int* flags, hipArray_t array) {
     return hipCUDAErrorTohipError(cudaArrayGetInfo(desc, extent, flags, array));
 }
 
 inline static hipError_t hipArrayGetDescriptor(HIP_ARRAY_DESCRIPTOR* pArrayDescriptor,
-                                               hipArray* array) {
+                                               hipArray_t array) {
     return hipCUResultTohipError(cuArrayGetDescriptor(pArrayDescriptor, (CUarray)array));
 }
 
 inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayDescriptor,
-                                                 hipArray* array) {
+                                                 hipArray_t array) {
     return hipCUResultTohipError(cuArray3DGetDescriptor(pArrayDescriptor, (CUarray)array));
 }
 

From ddb70737bd49ca938a4735c866d439c13385f62b Mon Sep 17 00:00:00 2001
From: Christophe Paquot <christophe.paquot@amd.com>
Date: Fri, 6 Oct 2023 08:47:36 -0700
Subject: [PATCH 105/177] SWDEV-371332 - Convert CUmemorytype to HIP enum

The memory-type value returned by CUDA must be converted to the corresponding HIP enum in hipPointerGetAttribute and hipDrvPointerGetAttributes.

Change-Id: Idb19d60971f360a71c7451cfc56cdc56831d262f
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 33 +++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index ac387be5d5..34be02c1b6 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1960,6 +1960,20 @@ inline static hipError_t hipMemcpy2D(void* dst, size_t dpitch, const void* src,
         cudaMemcpy2D(dst, dpitch, src, spitch, width, height, kind));
 }
 
+inline static hipMemoryType getHipMemoryType(CUmemorytype type) {
+    switch (type) {
+        case CU_MEMORYTYPE_HOST:
+            return hipMemoryTypeHost;
+        case CU_MEMORYTYPE_DEVICE:
+            return hipMemoryTypeDevice;
+        case CU_MEMORYTYPE_ARRAY:
+            return hipMemoryTypeArray;
+        case CU_MEMORYTYPE_UNIFIED:
+            return hipMemoryTypeUnified;
+    }
+    return hipMemoryTypeHost;
+}
+
 inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
   if(pCopy == nullptr) {
     return hipCUResultTohipError(cuMemcpy2D(nullptr));
@@ -2727,13 +2741,28 @@ inline static hipError_t hipPointerGetAttributes(hipPointerAttribute_t* attribut
 
 inline static hipError_t hipPointerGetAttribute(void* data, hipPointer_attribute attribute,
                                                 hipDeviceptr_t ptr) {
-    return hipCUResultTohipError(cuPointerGetAttribute(data, attribute, ptr));
+    hipError_t err = hipCUResultTohipError(cuPointerGetAttribute(data, attribute, ptr));
+    if (err == hipSuccess &&
+        attribute == HIP_POINTER_ATTRIBUTE_MEMORY_TYPE &&
+        data != nullptr) {
+        *reinterpret_cast<uint32_t*>(data) = getHipMemoryType(*reinterpret_cast<CUmemorytype*>(data));
+    }
+    return err;
 }
 
 inline static hipError_t hipDrvPointerGetAttributes(unsigned int numAttributes,
                                                     hipPointer_attribute* attributes,
                                                     void** data, hipDeviceptr_t ptr) {
-    return hipCUResultTohipError(cuPointerGetAttributes(numAttributes, attributes, data, ptr));
+    hipError_t err = hipCUResultTohipError(cuPointerGetAttributes(numAttributes, attributes, data, ptr));
+    if (err == hipSuccess && attributes != nullptr) {
+        for(int i = 0; i < numAttributes; i++) {
+          if(attributes[i] == HIP_POINTER_ATTRIBUTE_MEMORY_TYPE) {
+            *reinterpret_cast<uint32_t**>(data)[i] = getHipMemoryType(*reinterpret_cast<CUmemorytype**>(data)[i]);
+            break;
+          }
+        }
+    }
+    return err;
 }
 
 inline static hipError_t hipMemGetInfo(size_t* free, size_t* total) {

From f0cd4f7523ce93432f2080f5b1a614056c8ff916 Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Tue, 17 Oct 2023 04:35:59 +0000
Subject: [PATCH 106/177] SWDEV-408180 - Add a new hipMemcpyKind

Header changes to support new hipMemcpyDeviceToDeviceNoCU

Change-Id: I434d5b337ee7b76e930687674098b9488aaa22a9
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 34be02c1b6..218c118682 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -83,6 +83,7 @@ typedef enum cudaMemcpyKind hipMemcpyKind;
 #define hipMemcpyHostToDevice cudaMemcpyHostToDevice
 #define hipMemcpyDeviceToHost cudaMemcpyDeviceToHost
 #define hipMemcpyDeviceToDevice cudaMemcpyDeviceToDevice
+#define hipMemcpyDeviceToDeviceNoCU cudaMemcpyDeviceToDevice
 #define hipMemcpyDefault cudaMemcpyDefault
 
 typedef enum hipMemoryAdvise {
@@ -1368,6 +1369,7 @@ inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind ki
         case hipMemcpyDeviceToHost:
             return cudaMemcpyDeviceToHost;
         case hipMemcpyDeviceToDevice:
+        case hipMemcpyDeviceToDeviceNoCU:
             return cudaMemcpyDeviceToDevice;
         case hipMemcpyDefault:
             return cudaMemcpyDefault;

From ad18883265b2fb8dc433fd4a464513172915caab Mon Sep 17 00:00:00 2001
From: Saleel Kudchadker <Saleel.Kudchadker@amd.com>
Date: Wed, 18 Oct 2023 22:50:49 +0000
Subject: [PATCH 107/177] SWDEV-408180 - Fix NV build

Change-Id: I4647cb8d1c710075c1417a1842b9627f77aa479a
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 218c118682..becf7f63f5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1369,7 +1369,6 @@ inline static enum cudaMemcpyKind hipMemcpyKindToCudaMemcpyKind(hipMemcpyKind ki
         case hipMemcpyDeviceToHost:
             return cudaMemcpyDeviceToHost;
         case hipMemcpyDeviceToDevice:
-        case hipMemcpyDeviceToDeviceNoCU:
             return cudaMemcpyDeviceToDevice;
         case hipMemcpyDefault:
             return cudaMemcpyDefault;

From 2813f262d3ed387e5a4f710ad90ce512b55ab2f1 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Fri, 10 Nov 2023 14:58:49 +0000
Subject: [PATCH 108/177] SWDEV-431568 - use typedef struct for
 hipExternalMemoryMipmappedArrayDesc

Change-Id: Iee81c823f9794aa26ee3dd8700d3adb60348db22
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index becf7f63f5..a89e826330 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1483,7 +1483,7 @@ typedef enum cudaExternalMemoryHandleType hipExternalMemoryHandleType;
 typedef struct cudaExternalMemoryHandleDesc hipExternalMemoryHandleDesc;
 typedef struct cudaExternalMemoryBufferDesc hipExternalMemoryBufferDesc;
 typedef cudaExternalMemory_t hipExternalMemory_t;
-typedef cudaExternalMemoryMipmappedArrayDesc hipExternalMemoryMipmappedArrayDesc;
+typedef struct cudaExternalMemoryMipmappedArrayDesc hipExternalMemoryMipmappedArrayDesc;
 
 typedef enum cudaExternalSemaphoreHandleType hipExternalSemaphoreHandleType;
 #define hipExternalSemaphoreHandleTypeOpaqueFd cudaExternalSemaphoreHandleTypeOpaqueFd

From 75ef32dfe9fa714d4716d1b5fd7d537a2e2837e8 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Fri, 17 Nov 2023 12:09:21 +0000
Subject: [PATCH 109/177] SWDEV-432951 - Fixed C build for nvidia

Change-Id: I61de3da60fd1274cbf8e32f16f4ecfd8c428d5d5
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 142 +++++++++---------
 1 file changed, 71 insertions(+), 71 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a89e826330..eb62087274 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -552,99 +552,99 @@ typedef struct HIP_MEMCPY3D {
   unsigned int Depth;
 } HIP_MEMCPY3D;
 
-static inline void hipMemcpy3DTocudaMemcpy3D(CUDA_MEMCPY3D &a, const HIP_MEMCPY3D* p){
-    a.srcXInBytes = (size_t)p->srcXInBytes;
-    a.srcY = (size_t)p->srcY;
-    a.srcZ = (size_t)p->srcZ;
-    a.srcLOD = (size_t)p->srcLOD;
+static inline void hipMemcpy3DTocudaMemcpy3D(CUDA_MEMCPY3D* a, const HIP_MEMCPY3D* p){
+    a->srcXInBytes = (size_t)p->srcXInBytes;
+    a->srcY = (size_t)p->srcY;
+    a->srcZ = (size_t)p->srcZ;
+    a->srcLOD = (size_t)p->srcLOD;
     switch (p->srcMemoryType) {
         case hipMemoryTypeHost:
-            a.srcMemoryType = CU_MEMORYTYPE_HOST;
+            a->srcMemoryType = CU_MEMORYTYPE_HOST;
             break;
         case hipMemoryTypeDevice:
-            a.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+            a->srcMemoryType = CU_MEMORYTYPE_DEVICE;
             break;
         case hipMemoryTypeArray:
-            a.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            a->srcMemoryType = CU_MEMORYTYPE_ARRAY;
             break;
         default:
-            a.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+            a->srcMemoryType = CU_MEMORYTYPE_UNIFIED;
     }
-    a.srcHost = p->srcHost;
-    a.srcDevice =(CUdeviceptr)p->srcDevice;
-    a.srcArray = (CUarray)p->srcArray;
-    a.reserved0 = nullptr;
-    a.srcPitch = (size_t)p->srcPitch;
-    a.srcHeight = (size_t)p->srcHeight;
-    a.dstXInBytes = (size_t)p->dstXInBytes;
-    a.dstY = (size_t)p->dstY;
-    a.dstZ = (size_t)p->dstZ;
-    a.dstLOD = (size_t)p->dstLOD;
+    a->srcHost = p->srcHost;
+    a->srcDevice =(CUdeviceptr)p->srcDevice;
+    a->srcArray = (CUarray)p->srcArray;
+    a->reserved0 = NULL;
+    a->srcPitch = (size_t)p->srcPitch;
+    a->srcHeight = (size_t)p->srcHeight;
+    a->dstXInBytes = (size_t)p->dstXInBytes;
+    a->dstY = (size_t)p->dstY;
+    a->dstZ = (size_t)p->dstZ;
+    a->dstLOD = (size_t)p->dstLOD;
     switch (p->dstMemoryType) {
         case hipMemoryTypeHost:
-            a.dstMemoryType = CU_MEMORYTYPE_HOST;
+            a->dstMemoryType = CU_MEMORYTYPE_HOST;
             break;
         case hipMemoryTypeDevice:
-            a.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+            a->dstMemoryType = CU_MEMORYTYPE_DEVICE;
             break;
         case hipMemoryTypeArray:
-            a.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            a->dstMemoryType = CU_MEMORYTYPE_ARRAY;
             break;
         default:
-            a.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+            a->dstMemoryType = CU_MEMORYTYPE_UNIFIED;
     }
-    a.dstHost = p->dstHost;
-    a.dstDevice = (CUdeviceptr)p->dstDevice;
-    a.dstArray = (CUarray)p->dstArray;
-    a.reserved1 = nullptr;
-    a.dstPitch = (size_t)p->dstPitch;
-    a.dstHeight = (size_t)p->dstHeight;
-    a.WidthInBytes = (size_t)p->WidthInBytes;
-    a.Height = (size_t)p->Height;
-    a.Depth = (size_t)p->Depth;
+    a->dstHost = p->dstHost;
+    a->dstDevice = (CUdeviceptr)p->dstDevice;
+    a->dstArray = (CUarray)p->dstArray;
+    a->reserved1 = NULL;
+    a->dstPitch = (size_t)p->dstPitch;
+    a->dstHeight = (size_t)p->dstHeight;
+    a->WidthInBytes = (size_t)p->WidthInBytes;
+    a->Height = (size_t)p->Height;
+    a->Depth = (size_t)p->Depth;
 }
 
-static inline void hipMemcpy2DTocudaMemcpy2D(CUDA_MEMCPY2D &a, const hip_Memcpy2D* p){
-    a.srcXInBytes = (size_t)p->srcXInBytes;
-    a.srcY = (size_t)p->srcY;
+static inline void hipMemcpy2DTocudaMemcpy2D(CUDA_MEMCPY2D* a, const hip_Memcpy2D* p){
+    a->srcXInBytes = (size_t)p->srcXInBytes;
+    a->srcY = (size_t)p->srcY;
     switch (p->srcMemoryType) {
         case hipMemoryTypeHost:
-            a.srcMemoryType = CU_MEMORYTYPE_HOST;
+            a->srcMemoryType = CU_MEMORYTYPE_HOST;
             break;
         case hipMemoryTypeDevice:
-            a.srcMemoryType = CU_MEMORYTYPE_DEVICE;
+            a->srcMemoryType = CU_MEMORYTYPE_DEVICE;
             break;
         case hipMemoryTypeArray:
-            a.srcMemoryType = CU_MEMORYTYPE_ARRAY;
+            a->srcMemoryType = CU_MEMORYTYPE_ARRAY;
             break;
         default:
-            a.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
+            a->srcMemoryType = CU_MEMORYTYPE_UNIFIED;
     }
-    a.srcHost = p->srcHost;
-    a.srcDevice = (CUdeviceptr)p->srcDevice;
-    a.srcArray = (CUarray)p->srcArray;
-    a.srcPitch = (size_t)p->srcPitch;
-    a.dstXInBytes = (size_t)p->dstXInBytes;
-    a.dstY = (size_t)p->dstY;
+    a->srcHost = p->srcHost;
+    a->srcDevice = (CUdeviceptr)p->srcDevice;
+    a->srcArray = (CUarray)p->srcArray;
+    a->srcPitch = (size_t)p->srcPitch;
+    a->dstXInBytes = (size_t)p->dstXInBytes;
+    a->dstY = (size_t)p->dstY;
     switch (p->dstMemoryType) {
         case hipMemoryTypeHost:
-            a.dstMemoryType = CU_MEMORYTYPE_HOST;
+            a->dstMemoryType = CU_MEMORYTYPE_HOST;
             break;
         case hipMemoryTypeDevice:
-            a.dstMemoryType = CU_MEMORYTYPE_DEVICE;
+            a->dstMemoryType = CU_MEMORYTYPE_DEVICE;
             break;
         case hipMemoryTypeArray:
-            a.dstMemoryType = CU_MEMORYTYPE_ARRAY;
+            a->dstMemoryType = CU_MEMORYTYPE_ARRAY;
             break;
         default:
-            a.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
+            a->dstMemoryType = CU_MEMORYTYPE_UNIFIED;
     }
-    a.dstHost = p->dstHost;
-    a.dstDevice = (CUdeviceptr)p->dstDevice;
-    a.dstArray = (CUarray)p->dstArray;
-    a.dstPitch = (size_t)p->dstPitch;
-    a.WidthInBytes = (size_t)p->WidthInBytes;
-    a.Height = (size_t)p->Height;
+    a->dstHost = p->dstHost;
+    a->dstDevice = (CUdeviceptr)p->dstDevice;
+    a->dstArray = (CUarray)p->dstArray;
+    a->dstPitch = (size_t)p->dstPitch;
+    a->WidthInBytes = (size_t)p->WidthInBytes;
+    a->Height = (size_t)p->Height;
 }
 
 
@@ -1976,21 +1976,21 @@ inline static hipMemoryType getHipMemoryType(CUmemorytype type) {
 }
 
 inline static hipError_t hipMemcpyParam2D(const hip_Memcpy2D* pCopy) {
-  if(pCopy == nullptr) {
-    return hipCUResultTohipError(cuMemcpy2D(nullptr));
+  if(pCopy == NULL) {
+    return hipCUResultTohipError(cuMemcpy2D(NULL));
   } else {
     CUDA_MEMCPY2D cudaCopy = {0};
-    hipMemcpy2DTocudaMemcpy2D(cudaCopy, pCopy);
+    hipMemcpy2DTocudaMemcpy2D(&cudaCopy, pCopy);
     return hipCUResultTohipError(cuMemcpy2D((const CUDA_MEMCPY2D*)&cudaCopy));
   }
 }
 
 inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStream_t stream __dparm(0)) {
-  if(pCopy == nullptr) {
-    return hipCUResultTohipError(cuMemcpy2DAsync(nullptr, stream));
+  if(pCopy == NULL) {
+    return hipCUResultTohipError(cuMemcpy2DAsync(NULL, stream));
   } else {
     CUDA_MEMCPY2D cudaCopy = {0};
-    hipMemcpy2DTocudaMemcpy2D(cudaCopy, pCopy);
+    hipMemcpy2DTocudaMemcpy2D(&cudaCopy, pCopy);
     return hipCUResultTohipError(cuMemcpy2DAsync((const CUDA_MEMCPY2D*)&cudaCopy, stream));
   }
 }
@@ -2004,21 +2004,21 @@ inline static hipError_t hipMemcpy3DAsync(const struct hipMemcpy3DParms *p, hipS
 }
 
 inline static hipError_t hipDrvMemcpy3D(const HIP_MEMCPY3D* pcopy) {
-    if(pcopy == nullptr) {
-      return hipCUResultTohipError(cuMemcpy3D(nullptr));
+    if(pcopy == NULL) {
+      return hipCUResultTohipError(cuMemcpy3D(NULL));
     } else {
       CUDA_MEMCPY3D cudaCopy = {0};
-      hipMemcpy3DTocudaMemcpy3D(cudaCopy, pcopy);
+      hipMemcpy3DTocudaMemcpy3D(&cudaCopy, pcopy);
       return hipCUResultTohipError(cuMemcpy3D((const CUDA_MEMCPY3D*)&cudaCopy));
     }
 }
 
 inline static hipError_t hipDrvMemcpy3DAsync(const HIP_MEMCPY3D *pcopy, hipStream_t stream) {
-    if(pcopy == nullptr) {
-      return hipCUResultTohipError(cuMemcpy3DAsync(nullptr, stream));
+    if(pcopy == NULL) {
+      return hipCUResultTohipError(cuMemcpy3DAsync(NULL, stream));
     } else {
       CUDA_MEMCPY3D cudaCopy = {0};
-      hipMemcpy3DTocudaMemcpy3D(cudaCopy, pcopy);
+      hipMemcpy3DTocudaMemcpy3D(&cudaCopy, pcopy);
       return hipCUResultTohipError(cuMemcpy3DAsync((const CUDA_MEMCPY3D*)&cudaCopy, stream));
     }
 }
@@ -2745,8 +2745,8 @@ inline static hipError_t hipPointerGetAttribute(void* data, hipPointer_attribute
     hipError_t err = hipCUResultTohipError(cuPointerGetAttribute(data, attribute, ptr));
     if (err == hipSuccess &&
         attribute == HIP_POINTER_ATTRIBUTE_MEMORY_TYPE &&
-        data != nullptr) {
-        *reinterpret_cast<uint32_t*>(data) = getHipMemoryType(*reinterpret_cast<CUmemorytype*>(data));
+        data != NULL) {
+        *(uint32_t*) data = getHipMemoryType(*(CUmemorytype*) data);
     }
     return err;
 }
@@ -2755,10 +2755,10 @@ inline static hipError_t hipDrvPointerGetAttributes(unsigned int numAttributes,
                                                     hipPointer_attribute* attributes,
                                                     void** data, hipDeviceptr_t ptr) {
     hipError_t err = hipCUResultTohipError(cuPointerGetAttributes(numAttributes, attributes, data, ptr));
-    if (err == hipSuccess && attributes != nullptr) {
+    if (err == hipSuccess && attributes != NULL) {
         for(int i = 0; i < numAttributes; i++) {
           if(attributes[i] == HIP_POINTER_ATTRIBUTE_MEMORY_TYPE) {
-            *reinterpret_cast<uint32_t**>(data)[i] = getHipMemoryType(*reinterpret_cast<CUmemorytype**>(data)[i]);
+            *((uint32_t**) data)[i] = getHipMemoryType(*((CUmemorytype**) data)[i]);
             break;
           }
         }

From ee2893ac63fee91c0b224fbe2eec7565ee7c6ffc Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Mon, 11 Dec 2023 23:31:28 +0000
Subject: [PATCH 110/177] SWDEV-422771 - Add README

Change-Id: I0895d631347208967327df6aaf59bf6936548a9f
---
 LICENSE.txt | 20 ++++++++++++++++++++
 README.md   | 22 ++++++++++++++++++++++
 2 files changed, 42 insertions(+)
 create mode 100644 LICENSE.txt
 create mode 100644 README.md

diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000000..ec6d7bf688
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,20 @@
+Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000..c89c90f61c
--- /dev/null
+++ b/README.md
@@ -0,0 +1,22 @@
+## What is this repository for? ###
+
+This repository provides files required to support non-AMD specific back-end implementation for [HIP](https://github.com/ROCm/HIP).
+
+## DISCLAIMER
+
+The information presented in this document is for informational purposes only and may contain technical inaccuracies, omissions, and typographical errors. The information contained herein is subject to change and may be rendered inaccurate for many reasons, including but not limited to product and roadmap changes, component and motherboard version changes, new model and/or product releases, product differences between differing manufacturers, software changes, BIOS flashes, firmware upgrades, or the like. Any computer system has risks of security vulnerabilities that cannot be completely prevented or mitigated. AMD assumes no obligation to update or otherwise correct or revise this information. However, AMD reserves the right to revise this information and to make changes from time to time to the content hereof without obligation of AMD to notify any person of such revisions or changes. THIS INFORMATION IS PROVIDED “AS IS.” AMD MAKES NO REPRESENTATIONS OR WARRANTIES WITH RESPECT TO THE CONTENTS HEREOF AND ASSUMES NO RESPONSIBILITY FOR ANY INACCURACIES, ERRORS, OR OMISSIONS THAT MAY APPEAR IN THIS INFORMATION. AMD SPECIFICALLY DISCLAIMS ANY IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR ANY PARTICULAR PURPOSE. IN NO EVENT WILL AMD BE LIABLE TO ANY PERSON FOR ANY RELIANCE, DIRECT, INDIRECT, SPECIAL, OR OTHER CONSEQUENTIAL DAMAGES ARISING FROM THE USE OF ANY INFORMATION CONTAINED HEREIN, EVEN IF AMD IS EXPRESSLY ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. AMD, the AMD Arrow logo, and combinations thereof are trademarks of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies.
+
+©2023 Advanced Micro Devices, Inc. All Rights Reserved.
+
+## Repository branches:
+
+The hipother repository maintains several branches. The branches that are of importance are:
+
+* Develop branch: This is the default branch, on which the new features are still under development and visible. While this may be of interest to many, it should be noted that this branch and the features under development might not be stable.
+* Release branches. These are branches corresponding to each ROCM release, listed with release tags, such as rocm-6.0, etc.
+
+## Release tagging:
+
+hipother releases are typically tagged using a naming convention that matches each ROCM release, to help differentiate them.
+
+* rocm x.yy: These are the stable releases based on the ROCM release.

From 8195ac393a4b91a86e616137f4e006ddf0832708 Mon Sep 17 00:00:00 2001
From: Rahul Manocha <rahul.manocha@amd.com>
Date: Tue, 23 Jan 2024 18:15:51 +0000
Subject: [PATCH 111/177] SWDEV-421025 Graph Kernel Node Attr ID/Value for
 Nvidia

Change-Id: Ia4676acbadf30fab4041ac7939c8dc6e3996b4e7
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 23 +++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index eb62087274..391ee8d90c 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1574,6 +1574,20 @@ typedef union cudaKernelNodeAttrValue hipKernelNodeAttrValue;
 typedef enum  cudaKernelNodeAttrID hipKernelNodeAttrID;
 #define hipKernelNodeAttributeAccessPolicyWindow cudaKernelNodeAttributeAccessPolicyWindow
 #define hipKernelNodeAttributeCooperative cudaKernelNodeAttributeCooperative
+#define hipKernelNodeAttributePriority cudaKernelNodeAttributePriority
+
+#if CUDA_VERSION >= CUDA_12000
+typedef enum cudaGraphInstantiateResult hipGraphInstantiateResult;
+#define hipGraphInstantiateSuccess cudaGraphInstantiateSuccess
+#define hipGraphInstantiateError cudaGraphInstantiateError
+#define hipGraphInstantiateInvalidStructure cudaGraphInstantiateInvalidStructure
+#define hipGraphInstantiateNodeOperationNotSupported cudaGraphInstantiateNodeOperationNotSupported
+#define hipGraphInstantiateMultipleDevicesNotSupported \
+                                                     cudaGraphInstantiateMultipleDevicesNotSupported
+
+#define hipGraphInstantiateParams cudaGraphInstantiateParams
+#endif
+
 typedef enum cudaAccessProperty hipAccessProperty;
 #define hipAccessPropertyNormal cudaAccessPropertyNormal
 #define hipAccessPropertyStreaming cudaAccessPropertyStreaming
@@ -3595,6 +3609,15 @@ inline static hipError_t hipGraphInstantiate(hipGraphExec_t* pGraphExec, hipGrap
         cudaGraphInstantiate(pGraphExec, graph, pErrorNode, pLogBuffer, bufferSize));
 }
 
+#if CUDA_VERSION >= CUDA_12000
+inline static hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExec, hipGraph_t graph,
+                                                       hipGraphInstantiateParams *instantiateParams)
+                                                       {
+    return hipCUDAErrorTohipError(cudaGraphInstantiateWithParams(pGraphExec, graph,
+                                                                 instantiateParams));
+}
+#endif
+
 #if CUDA_VERSION >= CUDA_11040
 inline static hipError_t hipGraphInstantiateWithFlags(hipGraphExec_t* pGraphExec, hipGraph_t graph,
                                                       unsigned long long flags) {

From 873a9b389b180f9506a1793ae96ffb6b5b1b4db2 Mon Sep 17 00:00:00 2001
From: David Galiffi <David.Galiffi@amd.com>
Date: Thu, 1 Feb 2024 18:23:13 -0500
Subject: [PATCH 112/177] SWDEV-437287 - Add CODEOWNERS file

Add @gargrahul, @rakesroy, and @mangupta as CODEOWNERS.
This is for GitHub upstream.

Signed-off-by: David Galiffi <David.Galiffi@amd.com>
Change-Id: Ia47a55263657f6f891ee487bed3f13709ee0fd79
---
 CODEOWNERS | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 CODEOWNERS

diff --git a/CODEOWNERS b/CODEOWNERS
new file mode 100644
index 0000000000..2990acceb2
--- /dev/null
+++ b/CODEOWNERS
@@ -0,0 +1 @@
+* @gargrahul @mangupta @rakesroy

From bfe18b50f58f06ba13a76620b0251b760a56d367 Mon Sep 17 00:00:00 2001
From: Sebastian Luzynski <Sebastian.Luzynski@amd.com>
Date: Mon, 5 Feb 2024 12:08:38 +0000
Subject: [PATCH 113/177] SWDEV-444289 - Map
 hipDeviceAttributeCanUseStreamWaitValue to enum in CUDA

Signed-off-by: Sebastian Luzynski <Sebastian.Luzynski@amd.com>
Change-Id: I174bea3e37abfc6d92dc5f90376442c37370a831
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 391ee8d90c..cc7a5866f0 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2558,6 +2558,9 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeCanUseHostPointerForRegisteredMem:
             cdattr = cudaDevAttrCanUseHostPointerForRegisteredMem;
             break;
+        case hipDeviceAttributeCanUseStreamWaitValue:
+            cdattr = cudaDevAttrReserved92;
+            break;
         case hipDeviceAttributeComputePreemptionSupported:
             cdattr = cudaDevAttrComputePreemptionSupported;
             break;

From bc83af68dd78e4ae98a31eb74e5bb34c6b7ab94a Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Thu, 4 Jan 2024 11:21:48 -0500
Subject: [PATCH 114/177] SWDEV-436720 - Add nv mappings

- for hipDrvGraphAddMemsetNode and hipDrvGraphAddMemcpyNode

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: Ibb276fcab79b63587e6bf36d07ef129d7022ae3d
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cc7a5866f0..a30360f397 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -709,6 +709,7 @@ typedef struct cudaTextureDesc hipTextureDesc;
 typedef struct cudaResourceViewDesc hipResourceViewDesc;
 typedef CUDA_TEXTURE_DESC HIP_TEXTURE_DESC;
 typedef CUDA_RESOURCE_VIEW_DESC HIP_RESOURCE_VIEW_DESC;
+typedef CUDA_MEMSET_NODE_PARAMS HIP_MEMSET_NODE_PARAMS;
 // adding code for hipmemSharedConfig
 #define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
 #define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
@@ -4119,6 +4120,27 @@ inline static hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGr
                                                 unsigned int* isEnabled) {
     return hipCUDAErrorTohipError(cudaGraphNodeGetEnabled(hGraphExec, hNode, isEnabled));
 }
+
+inline static hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                 const hipGraphNode_t* dependencies, size_t numDependencies,
+                                 const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx) {
+    return hipCUResultTohipError(cuGraphAddMemsetNode(phGraphNode, hGraph, dependencies, numDependencies,
+                                    memsetParams, ctx));
+}
+
+inline static hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                    const hipGraphNode_t* dependencies, size_t numDependencies,
+                                    const HIP_MEMCPY3D* copyParams, hipCtx_t ctx) {
+    if(copyParams == NULL) {
+      return hipCUResultTohipError((cuGraphAddMemcpyNode(phGraphNode, hGraph, dependencies,
+                                    numDependencies, NULL, ctx)));
+    } else {
+      CUDA_MEMCPY3D cudaCopy = {0};
+      hipMemcpy3DTocudaMemcpy3D(&cudaCopy, copyParams);
+      return hipCUResultTohipError((cuGraphAddMemcpyNode(phGraphNode, hGraph, dependencies,
+                                    numDependencies, (const CUDA_MEMCPY3D*)&cudaCopy, ctx)));
+    }
+}
 #endif
 #if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream) {

From 14a658824acd0792b12de592a8b065218a82fdfd Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Tue, 26 Mar 2024 14:43:15 +0530
Subject: [PATCH 115/177] SWDEV-452914 - Add hipGetFuncBySymbol for NV

Change-Id: I85dba9de6f083a6532f908fa3ceab4305c3bbf30
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a30360f397..287022e951 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3211,6 +3211,12 @@ __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
 }
 #endif
 
+#if CUDA_VERSION >= CUDA_11010
+inline static hipError_t hipGetFuncBySymbol(hipFunction_t* functionPtr, const void* symbolPtr) {
+    return hipCUDAErrorTohipError(cudaGetFuncBySymbol(functionPtr, symbolPtr));
+}
+#endif
+
 inline static hipError_t hipGetChannelDesc(hipChannelFormatDesc* desc, hipArray_const_t array)
 {
     return hipCUDAErrorTohipError(cudaGetChannelDesc(desc,array));

From 9706861f2a73a1bc9ccb48592c9a207b0baa4287 Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Wed, 27 Mar 2024 09:23:36 +0000
Subject: [PATCH 116/177] SWDEV-453498 - Add mapping for hipGetProcAddress.

Change-Id: I09d88c0843c8deebdb96ca81b451c5282fa5b206
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 287022e951..67500c4858 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3106,7 +3106,13 @@ inline static hipError_t hipModuleGetGlobal(hipDeviceptr_t* dptr, size_t* bytes,
 inline static hipError_t hipModuleLoadData(hipModule_t* module, const void* image) {
     return hipCUResultTohipError(cuModuleLoadData(module, image));
 }
-
+#if CUDA_VERSION >= CUDA_12000
+inline static hipError_t hipGetProcAddress(const char* symbol, void** pfn, int version,
+                                           uint64_t flags, hipDriverProcAddressQueryResult* symbolStatus) {
+    return hipCUResultTohipError(cuGetProcAddress(symbol, pfn, version, flags,
+                                                  (CUdriverProcAddressQueryResult*)symbolStatus));
+}
+#endif
 inline static hipError_t hipModuleLoadDataEx(hipModule_t* module, const void* image,
                                              unsigned int numOptions, hipJitOption* options,
                                              void** optionValues) {

From a63bfacca707e4d40a483fadb797598547db9fcc Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <anusha.godavarthysurya@amd.com>
Date: Wed, 3 Apr 2024 13:45:21 +0000
Subject: [PATCH 117/177] SWDEV-454718 - Add mapping for
 hipStreamBeginCaptureToGraph

Change-Id: I558cdbec3b76a1809e71ce4016cd267e22997a63
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h     | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 67500c4858..355f6c34c5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11040 11040
 #define CUDA_11060 11060
 #define CUDA_12000 12000
+#define CUDA_12030 12030
 
 #ifdef __cplusplus
 extern "C" {
@@ -1525,6 +1526,9 @@ typedef cudaGraph_t hipGraph_t;
 typedef cudaGraphNode_t hipGraphNode_t;
 typedef cudaGraphExec_t hipGraphExec_t;
 typedef cudaUserObject_t hipUserObject_t;
+#if CUDA_VERSION >= CUDA_12030
+typedef cudaGraphEdgeData hipGraphEdgeData;
+#endif
 
 typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeKernel cudaGraphNodeTypeKernel
@@ -3601,7 +3605,15 @@ inline static hipError_t hipArray3DGetDescriptor(HIP_ARRAY3D_DESCRIPTOR* pArrayD
 inline static hipError_t hipStreamBeginCapture(hipStream_t stream, hipStreamCaptureMode mode) {
     return hipCUDAErrorTohipError(cudaStreamBeginCapture(stream, mode));
 }
-
+#if CUDA_VERSION >= CUDA_12030
+inline static hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGraph_t graph,
+                                        const hipGraphNode_t *dependencies,
+                                        const hipGraphEdgeData *dependencyData,
+                                        size_t numDependencies, hipStreamCaptureMode mode) {
+    return hipCUDAErrorTohipError(cudaStreamBeginCaptureToGraph(
+        stream, graph, dependencies, dependencyData, numDependencies, mode));
+}
+#endif
 inline static hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph) {
     return hipCUDAErrorTohipError(cudaStreamEndCapture(stream, pGraph));
 }

From e1f71f4a204e746a898fb3d7e916f230130e4968 Mon Sep 17 00:00:00 2001
From: Julia Jiang <julia.jiang@amd.com>
Date: Mon, 15 Apr 2024 14:00:20 -0400
Subject: [PATCH 118/177] SWDEV-449580 - Adding contributing.md in hipother
 repos

Change-Id: I97a951bee5cd2f3fcee85a9d8e0aeecc949d9983
---
 CONTRIBUTING.md | 133 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..9884640b33
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,133 @@
+# Contributing to hipother #
+
+We welcome contributions to the hipother project. Please follow these details to help ensure your contributions will be successfully accepted.
+If you want to contribute to our documentation, refer to {doc}`Contribute to ROCm docs <rocm:contribute/contributing>`.
+
+## Issue Discussion ##
+
+Please use the [GitHub Issue](https://github.com/ROCm/hipother/issues) tab to notify us of issues.
+
+* Use your best judgement for issue creation. If your issue is already listed, upvote the issue and
+  comment or post to provide additional details, such as how you reproduced this issue.
+* If you're not sure if your issue is the same, err on the side of caution and file your issue.
+  You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
+  your issue as being the same as the existing issue, we'll close the duplicate.
+* If your issue doesn't exist, use the issue template to file a new issue.
+  * When filing an issue, be sure to provide as much information as possible, including script output so
+    we can collect information about your configuration. This helps reduce the time required to
+    reproduce your issue.
+  * Check your issue regularly, as we may require additional information to successfully reproduce the
+    issue.
+* You may also open an issue to ask questions to the maintainers about whether a proposed change
+  meets the acceptance criteria, or to discuss an idea pertaining to the library.
+
+## Acceptance Criteria ##
+
+HIPOTHER is a C++ Runtime API interface with CUDA APIs that allows developers to create portable applications for AMD and NVIDIA GPUs from single source code. Contributors wishing to submit new HIP Features (ie functions, classes, types) should also consider CUDA APIs.
+Differences or limitations of HIP APIs as compared to CUDA APIs should be clearly documented and described.
+Some guidelines are outlined below:
+
+### Add a new HIP API ###
+
+- Add a translation to the hipify-clang tool ; many examples abound.
+    - For stat tracking purposes, place the API into an appropriate stat category ("dev", "mem", "stream", etc).
+- Add an inlined NVIDIA implementation for the function in /hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h in the repository [hipother](https://github.com/ROCm/hipother).
+    - These are typically headers
+- Add an HIP definition and Doxygen comments for the function in /include/hip/hip_runtime_api.h, in the repository [hip](https://github.com/ROCm/hip).
+    - Source implementations typically go in clr/hipamd/src/hip_*.cpp in the repository [clr](https://github.com/ROCm/clr). The implementation involves calls to HIP runtime (i.e. for hipStream_t).
+
+### Run Unit Tests ###
+
+For new features or bug fixes, it's mandatory to run the associated [hip-tests](https://github.com/ROCm/hip-tests) on both AMD and NVIDIA platforms.
+Please go to the repo and follow the steps.
+
+For applications and benchmarks outside the hip-tests environment, developers should use a two-step development flow:
+- #1. Compile, link, and install HIP. See {ref}`Building the HIP runtime` notes.
+- #2. Relink the target application to include changes in HIP runtime file.
+
+## Code Structure ##
+
+hipother contains mainly header files with interfaces of different types of HIP APIs to the corresponding CUDA runtime or driver APIs, for example,
+- `hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h` - contains enumerations and HIP runtime API interfaces corresponding to CUDA enumerations and runtime APIs.
+
+- `hipnv/include/hip/nvidia_detail/nvidia_hiprtc.h` - contains HIP runtime compiler enumerations and APIs corresponding to CUDA.
+
+
+## Coding Style ##
+- Code Indentation:
+    - Tabs should be expanded to spaces.
+    - Use 4 spaces indentation.
+- Capitalization and Naming
+    - Prefer camelCase for HIP interfaces and internal symbols.  Note HCC uses _ for separator.
+    - Member variables should begin with a leading "_".  This allows them to be easily distinguished from other variables or functions.
+
+- `{}` placement
+    - namespace should be on same line as `{` and separated by a space.
+    - Single-line if statement should still use `{/}` pair (even though C++ does not require).
+    - For functions, the opening `{` should be placed on a new line.
+    - For if/else blocks, the opening `{` is placed on same line as the if/else. Use a space to separate `{` from if/else. For example,
+```console
+    if (foo) {
+        doFoo()
+    } else {
+        doFooElse();
+    }
+```
+
+
+## Pull Request Guidelines ##
+
+By creating a pull request, you agree to the statements made in the code license section. Your pull request should target the default branch. Our current default branch is the develop branch, which serves as our integration branch.
+
+Follow existing best practice for writing a good Git commit message.
+
+Some tips:
+    http://chris.beams.io/posts/git-commit/
+    https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
+
+In particular :
+   - Use imperative voice, ie "Fix this bug", "Refactor the XYZ routine", "Update the doc".
+     Not : "Fixing the bug", "Fixed the bug", "Bug fix", etc.
+   - Subject should summarize the commit.  Do not end subject with a period.  Use a blank line
+     after the subject.
+
+### Deliverables ###
+
+hipother is part of HIP open source library. Because of this, we include the following license description at the top of every source file.
+If you create new source files in the repository, please include this text in them as well (replacing "xx" with the digits for the current year):
+```
+// Copyright (c) 20xx Advanced Micro Devices, Inc. All rights reserved.
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+```
+
+### Process ###
+
+After you create a PR, you can take a look at a diff of the changes you made using the PR's "Files" tab.
+
+PRs must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged.
+
+Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table
+to view logs associated with a check if it fails.
+
+During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is
+needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas.
+When a modification request has been completed, the conversation thread about it will be marked as resolved.
+
+To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
\ No newline at end of file

From 87925b8330b22181efdfe49ae4188304bbc43438 Mon Sep 17 00:00:00 2001
From: Julia Jiang <julia.jiang@amd.com>
Date: Mon, 29 Apr 2024 19:03:40 -0400
Subject: [PATCH 119/177] SWDEV-459583 - Update codeownder in hipother repos

Change-Id: Ifbe9254e8773b911c24962486c64ea83e73992c5
---
 CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 2990acceb2..87f9d23e74 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1 +1 @@
-* @gargrahul @mangupta @rakesroy
+* @cpaquot @gandryey @skudchad @mangupta @rakesroy

From 3cd4516b100e13d98580ac73db11788649806679 Mon Sep 17 00:00:00 2001
From: Satyanvesh Dittakavi <Satyanvesh.Dittakavi@amd.com>
Date: Thu, 18 Apr 2024 17:32:32 +0530
Subject: [PATCH 120/177] SWDEV-453527 - Add mapping for
 hipDrvMemcpy2DUnaligned

Change-Id: Ifaad326aa4455b4af2e5f0a26c1d9c7a3e6f6833
---
 .../include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 355f6c34c5..88a699965b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2014,6 +2014,16 @@ inline static hipError_t hipMemcpyParam2DAsync(const hip_Memcpy2D* pCopy, hipStr
   }
 }
 
+inline static hipError_t hipDrvMemcpy2DUnaligned(const hip_Memcpy2D* pCopy) {
+  if(pCopy == NULL) {
+    return hipCUResultTohipError(cuMemcpy2DUnaligned(NULL));
+  } else {
+    CUDA_MEMCPY2D cudaCopy = {0};
+    hipMemcpy2DTocudaMemcpy2D(&cudaCopy, pCopy);
+    return hipCUResultTohipError(cuMemcpy2DUnaligned((const CUDA_MEMCPY2D*)&cudaCopy));
+  }
+}
+
 inline static hipError_t hipMemcpy3D(const struct hipMemcpy3DParms *p) {
     return hipCUDAErrorTohipError(cudaMemcpy3D(p));
 }

From a3de1bcdd92c93d008d10f88fc695989a017b54b Mon Sep 17 00:00:00 2001
From: Ajay <ajay.gunashekar@amd.com>
Date: Wed, 15 May 2024 16:51:42 -0700
Subject: [PATCH 121/177] SWDEV-460814 - hipDeviceGetAttribute missed
 attributes for Nv platform

Change-Id: I4103aba6b9809c09fae30cada8c47ec1e8f16a35
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 88a699965b..a623890b7b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2582,6 +2582,42 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeHostNativeAtomicSupported:
             cdattr = cudaDevAttrHostNativeAtomicSupported;
             break;
+        case hipDeviceAttributeDeviceOverlap:
+            cdattr = cudaDevAttrGpuOverlap;
+            break;
+        case hipDeviceAttributeLocalL1CacheSupported:
+            cdattr = cudaDevAttrLocalL1CacheSupported;
+            break;
+        case hipDeviceAttributeMaxSurface1D:
+            cdattr = cudaDevAttrMaxSurface1DWidth;
+            break;
+        case hipDeviceAttributeMaxTexture1DLinear:
+            cdattr = cudaDevAttrMaxTexture1DLinearWidth;
+            break;
+        case hipDeviceAttributeMaxTexture1DMipmap:
+            cdattr = cudaDevAttrMaxTexture1DMipmappedWidth;
+            break;
+        case hipDeviceAttributeMaxTextureCubemap:
+            cdattr = cudaDevAttrMaxTextureCubemapWidth;
+            break;
+        case hipDeviceAttributePciDomainID:
+            cdattr = cudaDevAttrPciDomainId;
+            break;
+        case hipDeviceAttributePersistingL2CacheMaxSize:
+            cdattr = cudaDevAttrMaxPersistingL2CacheSize;
+            break;
+        case hipDeviceAttributeMaxRegistersPerMultiprocessor:
+            cdattr = cudaDevAttrMaxRegistersPerMultiprocessor;
+            break;
+        case hipDeviceAttributeSharedMemPerBlockOptin:
+            cdattr = cudaDevAttrMaxSharedMemoryPerBlockOptin;
+            break;
+        case hipDeviceAttributeSharedMemPerMultiprocessor:
+            cdattr = cudaDevAttrMaxSharedMemoryPerMultiprocessor;
+            break;
+        case hipDeviceAttributeMemoryPoolSupportedHandleTypes:
+            cdattr = cudaDevAttrMemoryPoolSupportedHandleTypes;
+            break;
         default:
             return hipCUDAErrorTohipError(cudaErrorInvalidValue);
     }

From 19390bc9cae4244ac6a786dba2bca565f044cdd5 Mon Sep 17 00:00:00 2001
From: Anusha GodavarthySurya <anusha.godavarthysurya@amd.com>
Date: Wed, 12 Jun 2024 09:04:36 +0000
Subject: [PATCH 122/177] SWDEV-466037 - Add nvidia mapping for hip APIs

Change-Id: If8ef48715a13720454f61f693debe51bd15a4edc
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 34 +++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a623890b7b..36949e1399 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -4217,6 +4217,40 @@ inline static hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t st
     return hipCUDAErrorTohipError(cudaGraphUpload(graphExec, stream));
 }
 #endif
+inline static hipError_t hipMemcpyAtoD(hipDeviceptr_t dstDevice, hipArray_t srcArray,
+                                       size_t srcOffset, size_t ByteCount) {
+    return hipCUResultTohipError(cuMemcpyAtoD(dstDevice, (CUarray)srcArray, srcOffset, ByteCount));
+}
+inline static hipError_t hipMemcpyDtoA(hipArray_t dstArray, size_t dstOffset,
+                                       hipDeviceptr_t srcDevice, size_t ByteCount) {
+    return hipCUResultTohipError(cuMemcpyDtoA((CUarray)dstArray, dstOffset, srcDevice, ByteCount));
+}
+inline static hipError_t hipMemcpyAtoA(hipArray_t dstArray, size_t dstOffset, hipArray_t srcArray,
+                                       size_t srcOffset, size_t ByteCount) {
+    return hipCUResultTohipError(
+      cuMemcpyAtoA((CUarray)dstArray, dstOffset, (CUarray)srcArray, srcOffset, ByteCount));
+}
+inline static hipError_t hipMemcpyAtoHAsync(void* dstHost, hipArray_t srcArray, size_t srcOffset,
+                                            size_t ByteCount, hipStream_t stream) {
+    return hipCUResultTohipError(
+      cuMemcpyAtoHAsync(dstHost, (CUarray)srcArray, srcOffset, ByteCount, stream));
+}
+inline static hipError_t hipMemcpyHtoAAsync(hipArray_t dstArray, size_t dstOffset,
+                                            const void* srcHost, size_t ByteCount,
+                                            hipStream_t stream) {
+    return hipCUResultTohipError(
+      cuMemcpyHtoAAsync((CUarray)dstArray, dstOffset, srcHost, ByteCount, stream));
+}
+inline static hipError_t hipMemcpy2DArrayToArray(hipArray_t dst, size_t wOffsetDst,
+                                                 size_t hOffsetDst, hipArray_const_t src,
+                                                 size_t wOffsetSrc, size_t hOffsetSrc, size_t width,
+                                                 size_t height, hipMemcpyKind kind) {
+    return hipCUDAErrorTohipError(cudaMemcpy2DArrayToArray(
+      dst, wOffsetDst, hOffsetDst, src, wOffsetSrc, hOffsetSrc, width, height, kind));
+}
+inline static hipError_t hipSetValidDevices(int* device_arr, int len) {
+    return hipCUDAErrorTohipError(cudaSetValidDevices(device_arr, len));
+}
 #endif  //__CUDACC__
 
 #endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_RUNTIME_API_H

From 1287d7e4a652ba552e392330e65adf1697e0d849 Mon Sep 17 00:00:00 2001
From: sdashmiz <shadi.dashmiz@amd.com>
Date: Mon, 22 Apr 2024 11:13:15 -0400
Subject: [PATCH 123/177] SWDEV-458069 - Fix return of local variable

Signed-off-by: sdashmiz <shadi.dashmiz@amd.com>
Change-Id: I8964f988383f61e35363490b669a416341150979
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 40 +++++++++----------
 1 file changed, 19 insertions(+), 21 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 36949e1399..cafe4e41f9 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -478,34 +478,32 @@ typedef struct HIP_RESOURCE_DESC_st
     unsigned int flags;                          /**< Flags (must be zero) */
 } HIP_RESOURCE_DESC;
 
-static inline CUDA_RESOURCE_DESC* hipResourceDesTocudaResourceDes(const HIP_RESOURCE_DESC* p){
-    CUDA_RESOURCE_DESC a;
+static inline void hipResourceDesTocudaResourceDes(CUDA_RESOURCE_DESC* a, const HIP_RESOURCE_DESC* p){
     switch (p->resType) {
         case HIP_RESOURCE_TYPE_ARRAY:
-            a.resType = CU_RESOURCE_TYPE_ARRAY;
+            a->resType = CU_RESOURCE_TYPE_ARRAY; break;  // do not fall through to the next case
         case HIP_RESOURCE_TYPE_MIPMAPPED_ARRAY:
-            a.resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY;
+            a->resType = CU_RESOURCE_TYPE_MIPMAPPED_ARRAY; break;
         case HIP_RESOURCE_TYPE_LINEAR:
-            a.resType = CU_RESOURCE_TYPE_LINEAR;
+            a->resType = CU_RESOURCE_TYPE_LINEAR; break;
         case HIP_RESOURCE_TYPE_PITCH2D:
-            a.resType = CU_RESOURCE_TYPE_PITCH2D;
+            a->resType = CU_RESOURCE_TYPE_PITCH2D; break;
         default:
-            a.resType = CU_RESOURCE_TYPE_ARRAY;
+            a->resType = CU_RESOURCE_TYPE_ARRAY;  // unrecognised type: treated as a plain array
     }
-    a.res.array.hArray = (CUarray)p->res.array.hArray;
-    a.res.mipmap.hMipmappedArray = (CUmipmappedArray)p->res.mipmap.hMipmappedArray;
-    a.res.linear.devPtr = p->res.linear.devPtr;
-    a.res.linear.format = p->res.linear.format;
-    a.res.linear.numChannels = p->res.linear.numChannels;
-    a.res.linear.sizeInBytes = p->res.linear.sizeInBytes;
-    a.res.pitch2D.devPtr = p->res.pitch2D.devPtr;
-    a.res.pitch2D.numChannels = p->res.pitch2D.numChannels;
-    a.res.pitch2D.format = p->res.pitch2D.format;
-    a.res.pitch2D.width = p->res.pitch2D.width;
-    a.res.pitch2D.height = p->res.pitch2D.height;
-    a.res.pitch2D.pitchInBytes = p->res.pitch2D.pitchInBytes;
-    a.flags = p->flags;
-    return &a;
+    a->res.array.hArray = (CUarray)p->res.array.hArray;
+    a->res.mipmap.hMipmappedArray = (CUmipmappedArray)p->res.mipmap.hMipmappedArray;
+    a->res.linear.devPtr = p->res.linear.devPtr;
+    a->res.linear.format = p->res.linear.format;
+    a->res.linear.numChannels = p->res.linear.numChannels;
+    a->res.linear.sizeInBytes = p->res.linear.sizeInBytes;
+    a->res.pitch2D.devPtr = p->res.pitch2D.devPtr;
+    a->res.pitch2D.numChannels = p->res.pitch2D.numChannels;
+    a->res.pitch2D.format = p->res.pitch2D.format;
+    a->res.pitch2D.width = p->res.pitch2D.width;
+    a->res.pitch2D.height = p->res.pitch2D.height;
+    a->res.pitch2D.pitchInBytes = p->res.pitch2D.pitchInBytes;
+    a->flags = p->flags;
 }
 
 typedef struct hip_Memcpy2D {

From 0ba07a685e26854dc173dbf2b5738a5f789c495d Mon Sep 17 00:00:00 2001
From: Jatin Chaudhary <JatinJaikishan.Chaudhary@amd.com>
Date: Mon, 24 Jun 2024 14:35:23 +0100
Subject: [PATCH 124/177] SWDEV-466747 - Add alias of HIPRT macro in CUDA

Change-Id: I9bc6ba3b05c5e45d075446c53511e079f5d0fbcb
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h
index 118996af1d..163007facf 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_bf16.h
@@ -20,13 +20,20 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-
 #ifndef HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H
 #define HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H
 
+#define HIPRT_ONE_BF16 CUDART_ONE_BF16
+#define HIPRT_ZERO_BF16 CUDART_ZERO_BF16
+#define HIPRT_INF_BF16 CUDART_INF_BF16
+#define HIPRT_MAX_NORMAL_BF16 CUDART_MAX_NORMAL_BF16
+#define HIPRT_MIN_DENORM_BF16 CUDART_MIN_DENORM_BF16
+#define HIPRT_NAN_BF16 CUDART_NAN_BF16
+#define HIPRT_NEG_ZERO_BF16 CUDART_NEG_ZERO_BF16
+
 #include <cuda_bf16.h>
 
 typedef struct __nv_bfloat16 __hip_bfloat16;
 typedef struct __nv_bfloat162 __hip_bfloat162;
 
-#endif  // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H
+#endif // HIP_INCLUDE_HIP_NVIDIA_DETAIL_HIP_FP16_H

From ca1fc948abc20c7389d900069d081afdc230bc68 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Mon, 15 Jul 2024 10:31:10 +0000
Subject: [PATCH 125/177] SWDEV-472433 - Update year in license

Change-Id: I19d6642389c0e88d766a0e9d095363b7e2d7c1af
---
 LICENSE.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LICENSE.txt b/LICENSE.txt
index ec6d7bf688..797310b44b 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,4 @@
-Copyright (c) 2008 - 2023 Advanced Micro Devices, Inc.
+Copyright (c) 2008 - 2024 Advanced Micro Devices, Inc.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

From 0e4a9d8882493abeb251ca801b8de21963dafe32 Mon Sep 17 00:00:00 2001
From: Maneesh Gupta <maneesh.gupta@amd.com>
Date: Wed, 24 Jul 2024 08:20:37 +0000
Subject: [PATCH 126/177] SWDEV-459583 - Fix codeowners file

Change-Id: Ib03328a7fb13375fa44626a40202b1eeb177b8b5
---
 CODEOWNERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 87f9d23e74..917a1d8d9b 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1 +1 @@
-* @cpaquot @gandryey @skudchad @mangupta @rakesroy
+* @chrispaquot @gandryey @saleelk @mangupta @rakesroy

From ccd569c4bb0e78ad830d8e8e9cf3048290a1e0a3 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Wed, 31 Jul 2024 15:21:46 +0200
Subject: [PATCH 127/177] SWDEV-476869 - Add mapping to cudaGetTextureReference

Change-Id: I85a6e4423d7b8b88ed18629225c583b87551414f
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cafe4e41f9..d923d98d83 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3259,6 +3259,11 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
 }
 
 #if CUDA_VERSION < CUDA_12000
+__HIP_DEPRECATED inline static hipError_t hipGetTextureReference(const textureReference** texref,
+                                                                 const void* symbol) {
+    return hipCUDAErrorTohipError(cudaGetTextureReference(texref, symbol));
+}
+
 __HIP_DEPRECATED inline static hipError_t hipGetTextureAlignmentOffset(
     size_t* offset, const struct textureReference* texref) {
     return hipCUDAErrorTohipError(cudaGetTextureAlignmentOffset(offset,texref));

From fe927012cbb2d2048ac7ba00ab9a57d51e0e1d54 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Thu, 1 Aug 2024 13:19:11 +0200
Subject: [PATCH 128/177] SWDEV-477086 - Add mapping to cuTexRefGetArray

Change-Id: I3a1b96a2a0c8d4281cdc934aa582a86f24d80b28
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d923d98d83..49065b4205 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3595,6 +3595,10 @@ inline static hipError_t hipTexObjectGetTextureDesc(HIP_TEXTURE_DESC* pTexDesc,
     return hipCUResultTohipError(cuTexObjectGetTextureDesc(pTexDesc, (CUtexObject)texObject));
 }
 
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetArray(hipArray_t* pArray, hipTexRef texRef) {
+    return hipCUResultTohipError(cuTexRefGetArray((CUarray*)pArray, texRef));
+}
+
 __HIP_DEPRECATED inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
     return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
 }

From ba9c7c3f212bd41fa5abad005085fab8e36a4d61 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Wed, 31 Jul 2024 12:06:27 +0200
Subject: [PATCH 129/177] SWDEV-476823 - Add mapping to cuTexRefGetAddress

Change-Id: I51fbcf704ae585d13a08537eaae30243b4c52d49
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 49065b4205..ad73b6911d 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3607,6 +3607,10 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexR
     return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
 }
 
+inline static hipError_t hipTexRefGetAddress(hipDeviceptr_t* dev_ptr, hipTexRef texRef) {
+    return hipCUResultTohipError(cuTexRefGetAddress(dev_ptr, texRef));
+}
+
 inline static hipError_t hipTexRefSetAddress(size_t *ByteOffset, hipTexRef hTexRef, hipDeviceptr_t dptr, size_t bytes){
     return hipCUResultTohipError(cuTexRefSetAddress(ByteOffset,hTexRef,dptr,bytes));
 }

From eb16d72255359812eee0ed3b6231ecfbd8bc2134 Mon Sep 17 00:00:00 2001
From: Vladana Stojiljkovic <Vladana.Stojiljkovic@amd.com>
Date: Wed, 26 Jun 2024 13:47:43 +0200
Subject: [PATCH 130/177] SWDEV-432785 - Remove the mapping of __shfl* to
 __shfl*_sync functions on CUDA

Change-Id: I93d2ce360b834b3da01731cabc22841f7d0a092d
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index ad73b6911d..6abfef43bd 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -752,13 +752,6 @@ typedef enum cudaGraphInstantiateFlags hipGraphInstantiateFlags;
 #define hipGraphInstantiateFlagDeviceLaunch cudaGraphInstantiateFlagDeviceLaunch
 #define hipGraphInstantiateFlagUseNodePriority cudaGraphInstantiateFlagUseNodePriority
 
-#if CUDA_VERSION >= CUDA_9000
-#define __shfl(...)      __shfl_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_up(...)   __shfl_up_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_down(...) __shfl_down_sync(0xffffffff, __VA_ARGS__)
-#define __shfl_xor(...)  __shfl_xor_sync(0xffffffff, __VA_ARGS__)
-#endif // CUDA_VERSION >= CUDA_9000
-
 inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
     switch (cuError) {
         case cudaSuccess:

From 4f4b13e498a1f51f76b9a631954d085efaa7a593 Mon Sep 17 00:00:00 2001
From: Vladana Stojiljkovic <Vladana.Stojiljkovic@amd.com>
Date: Tue, 6 Aug 2024 11:21:36 +0200
Subject: [PATCH 131/177] SWDEV-477711 - Add mapping for
 hipTexrefGetAddressMode

Change-Id: I8c721921760f0f09f2d93879eeeb97f457da6c58
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h       | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 6abfef43bd..ee90474465 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -190,11 +190,11 @@ inline static CUarray_format hipArray_FormatToCUarray_format(
 #define HIP_TR_ADDRESS_MODE_MIRROR CU_TR_ADDRESS_MODE_MIRROR
 #define HIP_TR_ADDRESS_MODE_BORDER CU_TR_ADDRESS_MODE_BORDER
 
-// hipAddress_mode
-#define hipAddress_mode CUaddress_mode
+// HIPaddress_mode
+#define HIPaddress_mode CUaddress_mode
 
 inline static CUaddress_mode hipAddress_modeToCUaddress_mode(
-    hipAddress_mode mode) {
+    HIPaddress_mode mode) {
     switch (mode) {
         case HIP_TR_ADDRESS_MODE_WRAP:
             return CU_TR_ADDRESS_MODE_WRAP;
@@ -3592,7 +3592,11 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefGetArray(hipArray_t* pArray,
     return hipCUResultTohipError(cuTexRefGetArray((CUarray*)pArray, texRef));
 }
 
-__HIP_DEPRECATED inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, hipAddress_mode am){
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetAddressMode(HIPaddress_mode *pam, hipTexRef hTexRef, int dim){
+    return hipCUResultTohipError(cuTexRefGetAddressMode(pam, hTexRef, dim));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetAddressMode(hipTexRef hTexRef, int dim, HIPaddress_mode am){
     return hipCUResultTohipError(cuTexRefSetAddressMode(hTexRef,dim,am));
 }
 

From cc4fdd8d9700379e6504ecea9eb0e1486155f7e3 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Fri, 9 Aug 2024 14:04:56 +0200
Subject: [PATCH 132/177] SWDEV-476869 - build fix for cuda version < 12000

Change-Id: I9086ffc0e5e0c1e9748163c8433c9e78c83d8f72
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index ee90474465..a4b645ae8c 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3252,7 +3252,7 @@ inline static hipError_t hipGetTextureObjectResourceDesc(hipResourceDesc* pResDe
 }
 
 #if CUDA_VERSION < CUDA_12000
-__HIP_DEPRECATED inline static hipError_t hipGetTextureReference(const textureReference** texref,
+__HIP_DEPRECATED inline static hipError_t hipGetTextureReference(const struct textureReference** texref,
                                                                  const void* symbol) {
     return hipCUDAErrorTohipError(cudaGetTextureReference(texref, symbol));
 }

From f9d31c596e6077c937985602d90684e49b16bfc2 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Wed, 7 Aug 2024 17:43:28 +0200
Subject: [PATCH 133/177] SWDEV-477711 - Map hipTexRefGetFormat to
 cuTexRefGetFormat

Change-Id: I21297fab6b5e420f2de5f24f9d1c7545e3e6af09
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index a4b645ae8c..e7148abe55 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3616,6 +3616,10 @@ inline static hipError_t hipTexRefSetAddress2D(hipTexRef hTexRef, const CUDA_ARR
     return hipCUResultTohipError(cuTexRefSetAddress2D(hTexRef,desc,dptr,Pitch));
 }
 
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetFormat(hipArray_Format *fmt, int *NumPackedComponents, hipTexRef hTexRef){
+    return hipCUResultTohipError(cuTexRefGetFormat(fmt, NumPackedComponents, hTexRef));
+}
+
 __HIP_DEPRECATED inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef, hipArray_Format fmt, int NumPackedComponents){
     return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
 }

From 4d0e2fe2245c790a169c76882027fb0c380fd1a5 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Tue, 6 Aug 2024 10:32:31 +0200
Subject: [PATCH 134/177] SWDEV-477699 - Add mapping to cuTexRefGetBorderColor
 and cuTexRefSetBorderColor

Change-Id: Id9e6874c5d075ddfcdb617aa7294bc40d2277a44
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index e7148abe55..f326fb5ade 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3604,6 +3604,14 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefSetFilterMode(hipTexRef hTexR
     return hipCUResultTohipError(cuTexRefSetFilterMode(hTexRef,fm));
 }
 
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetBorderColor(float* pBorderColor, hipTexRef hTexRef){
+    return hipCUResultTohipError(cuTexRefGetBorderColor(pBorderColor, hTexRef));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetBorderColor(hipTexRef hTexRef, float* pBorderColor){
+    return hipCUResultTohipError(cuTexRefSetBorderColor(hTexRef, pBorderColor));
+}
+
 inline static hipError_t hipTexRefGetAddress(hipDeviceptr_t* dev_ptr, hipTexRef texRef) {
     return hipCUResultTohipError(cuTexRefGetAddress(dev_ptr, texRef));
 }

From a44a75a56d25589e8fd527914bd531b8cad6769a Mon Sep 17 00:00:00 2001
From: amd-jmacaran <Joseph.Macaranas@amd.com>
Date: Thu, 20 Jun 2024 03:40:25 -0400
Subject: [PATCH 135/177] Enable external CI pipeline triggers

Change-Id: Ifcde2c489e5057b6bc37b02156b6232c299e4808
---
 .azuredevops/rocm-ci.yml | 50 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)
 create mode 100644 .azuredevops/rocm-ci.yml

diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
new file mode 100644
index 0000000000..4614fdcdf1
--- /dev/null
+++ b/.azuredevops/rocm-ci.yml
@@ -0,0 +1,50 @@
+resources:
+  repositories:
+  - repository: pipelines_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/ROCm
+  - repository: matching_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/clr
+    ref: amd-staging
+  - repository: hipother_repo
+    type: github
+    endpoint: ROCm
+    name: ROCm/hip # leverage HIP job that builds both AMD and NV backends
+    ref: amd-staging
+
+variables:
+- group: common
+- template: /.azuredevops/variables-global.yml@pipelines_repo
+
+trigger:
+  batch: true
+  branches:
+    include:
+    - amd-staging
+    - amd-mainline
+  paths:
+    exclude:
+    - '.github'
+    - CODEOWNERS
+    - LICENSE.txt
+    - '**/*.md'
+
+pr:
+  autoCancel: true
+  branches:
+    include:
+    - amd-staging
+    - amd-mainline
+  paths:
+    exclude:
+    - '.github'
+    - CODEOWNERS
+    - LICENSE.txt
+    - '**/*.md'
+  drafts: false
+
+jobs:
+  - template: ${{ variables.CI_COMPONENT_PATH }}/HIP.yml@pipelines_repo

From 58ec72277f908451a21b476b8d8b19d1f290cc5c Mon Sep 17 00:00:00 2001
From: Vladana Stojiljkovic <Vladana.Stojiljkovic@amd.com>
Date: Wed, 7 Aug 2024 12:16:17 +0200
Subject: [PATCH 136/177] SWDEV-477711 - Add mapping for hipTexRefGetFlags

Change-Id: I008ab15256bde3786d59f2bda4b34a3336dc5edf
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f326fb5ade..1f0f893408 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3632,6 +3632,10 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefSetFormat(hipTexRef hTexRef,
     return hipCUResultTohipError(cuTexRefSetFormat(hTexRef,fmt,NumPackedComponents));
 }
 
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetFlags(unsigned int *pFlags, hipTexRef hTexRef){
+    return hipCUResultTohipError(cuTexRefGetFlags(pFlags, hTexRef));
+}
+
 __HIP_DEPRECATED inline static hipError_t hipTexRefSetFlags(hipTexRef hTexRef, unsigned int Flags){
     return hipCUResultTohipError(cuTexRefSetFlags(hTexRef,Flags));
 }

From 7b46a48be37d64e1d983d7318e0b7c4236deea25 Mon Sep 17 00:00:00 2001
From: Vladana Stojiljkovic <Vladana.Stojiljkovic@amd.com>
Date: Thu, 8 Aug 2024 12:56:31 +0200
Subject: [PATCH 137/177] SWDEV-477711 - Add mappings for
 hipTexRefSetMaxAnisotropy and hipTexRefGetMaxAnisotropy

Change-Id: Ifab9ec542edf97c314d1fb47cd14461089902db4
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 1f0f893408..ec3e8e1e0a 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3644,6 +3644,14 @@ __HIP_DEPRECATED inline static hipError_t hipTexRefSetArray(hipTexRef hTexRef, h
     return hipCUResultTohipError(cuTexRefSetArray(hTexRef,(CUarray)hArray,Flags));
 }
 
+__HIP_DEPRECATED inline static hipError_t hipTexRefGetMaxAnisotropy(int* pmaxAniso, hipTexRef hTexRef) {
+    return hipCUResultTohipError(cuTexRefGetMaxAnisotropy(pmaxAniso, hTexRef));
+}
+
+__HIP_DEPRECATED inline static hipError_t hipTexRefSetMaxAnisotropy(hipTexRef hTexRef, unsigned int maxAniso) {
+    return hipCUResultTohipError(cuTexRefSetMaxAnisotropy(hTexRef, maxAniso));
+}
+
 inline static hipError_t hipArrayCreate(hipArray_t* pHandle, const HIP_ARRAY_DESCRIPTOR* pAllocateArray){
     return hipCUResultTohipError(cuArrayCreate((CUarray*)pHandle, pAllocateArray));
 }

From 6222fa632d046eeb4415bb97e8d05fe02cf14ba0 Mon Sep 17 00:00:00 2001
From: Joseph Macaranas <joseph.macaranas@amd.com>
Date: Thu, 15 Aug 2024 17:25:21 -0400
Subject: [PATCH 138/177] SWDEV-458516 - External CI: Pipeline case sensitivity
 fix

Change-Id: I6501cce7847b4f8bfaa435b143369362d2b8c249
---
 .azuredevops/rocm-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
index 4614fdcdf1..41901e3f01 100644
--- a/.azuredevops/rocm-ci.yml
+++ b/.azuredevops/rocm-ci.yml
@@ -12,7 +12,7 @@ resources:
   - repository: hipother_repo
     type: github
     endpoint: ROCm
-    name: ROCm/hip # leverage HIP job that builds both AMD and NV backends
+    name: ROCm/HIP # leverage HIP job that builds both AMD and NV backends
     ref: amd-staging
 
 variables:

From 335bb33f1b1dfd89d5d498615f148723799325a9 Mon Sep 17 00:00:00 2001
From: Ioannis Assiouras <Ioannis.Assiouras@amd.com>
Date: Mon, 12 Aug 2024 20:57:04 +0100
Subject: [PATCH 139/177] SWDEV-470372 - Un-deprecate hipHostAlloc, comply with
 cuda and introduce hipHostAlloc flags

Change-Id: I0c6910306ff98d97e9f8a6ccf467be6189e1ca86
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index ec3e8e1e0a..8ef1f234da 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -369,6 +369,11 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define hipHostMallocCoherent 0x0
 #define hipHostMallocNonCoherent 0x0
 
+#define hipHostAllocDefault cudaHostAllocDefault
+#define hipHostAllocPortable cudaHostAllocPortable
+#define hipHostAllocMapped cudaHostAllocMapped
+#define hipHostAllocWriteCombined cudaHostAllocWriteCombined
+
 #define hipMemAttachGlobal cudaMemAttachGlobal
 #define hipMemAttachHost cudaMemAttachHost
 #define hipMemAttachSingle cudaMemAttachSingle
@@ -1732,7 +1737,6 @@ inline static hipError_t hipMemAllocHost(void** ptr, size_t size) {
     return hipCUResultTohipError(cuMemAllocHost(ptr, size));
 }
 
-__HIP_DEPRECATED_MSG("use hipHostMalloc instead")
 inline static hipError_t hipHostAlloc(void** ptr, size_t size, unsigned int flags) {
     return hipCUDAErrorTohipError(cudaHostAlloc(ptr, size, flags));
 }
@@ -1840,7 +1844,6 @@ inline static hipError_t hipHostUnregister(void* ptr) {
     return hipCUDAErrorTohipError(cudaHostUnregister(ptr));
 }
 
-__HIP_DEPRECATED_MSG("use hipHostFree instead")
 inline static hipError_t hipFreeHost(void* ptr) {
     return hipCUDAErrorTohipError(cudaFreeHost(ptr));
 }

From 9ea767377e511689b23ef476a051de3359ad8b56 Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Tue, 13 Aug 2024 16:43:28 +0000
Subject: [PATCH 140/177] SWDEV-479043 - Add mapping for hipStreamLegacy.

Change-Id: I35f699641624b5ef15e5bc1b3c669a2e74547453
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 8ef1f234da..73ffedcf8e 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -255,6 +255,9 @@ inline static CUresourcetype hipResourcetype_enumToCUresourcetype(
 // hipStreamPerThread
 #define hipStreamPerThread ((cudaStream_t)2)
 
+// hipStreamLegacy
+#define hipStreamLegacy ((cudaStream_t)1)
+
 #define hipTexRef CUtexref
 typedef CUmipmappedArray hipmipmappedArray;
 typedef cudaMipmappedArray_t hipMipmappedArray_t;

From 1a12bbf19cda34c54984ddd3b9005b6fb34f33d4 Mon Sep 17 00:00:00 2001
From: Vladana Stojiljkovic <Vladana.Stojiljkovic@amd.com>
Date: Fri, 26 Jul 2024 12:33:26 +0200
Subject: [PATCH 141/177] SWDEV-475987 - Add mapping for missing CUDA error
 codes

Change-Id: I4cc5cb786fa4c7cc67cc037b00edad63d811ca73
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 73ffedcf8e..90284f90ee 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -904,6 +904,10 @@ inline static hipError_t hipCUDAErrorTohipError(cudaError_t cuError) {
             return hipErrorStreamCaptureWrongThread;
         case cudaErrorGraphExecUpdateFailure:
             return hipErrorGraphExecUpdateFailure;
+        case cudaErrorInvalidChannelDescriptor:
+            return hipErrorInvalidChannelDescriptor;
+        case cudaErrorInvalidTexture:
+            return hipErrorInvalidTexture;
         case cudaErrorUnknown:
         default:
             return hipErrorUnknown;  // Note - translated error.
@@ -1350,6 +1354,10 @@ inline static cudaError_t hipErrorToCudaError(hipError_t hError) {
             return cudaErrorGraphExecUpdateFailure;
         case hipErrorNotSupported:
             return cudaErrorNotSupported;
+        case hipErrorInvalidChannelDescriptor:
+            return cudaErrorInvalidChannelDescriptor;
+        case hipErrorInvalidTexture:
+            return cudaErrorInvalidTexture;
         // HSA: does not exist in CUDA
         case hipErrorRuntimeMemory:
         // HSA: does not exist in CUDA

From c944fb942609ec1dec0eb04e53890024f4f372d5 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Mon, 9 Sep 2024 13:42:15 +0200
Subject: [PATCH 142/177] SWDEV-483312 - Add graph API's interfaces

Change-Id: I83afcc2ba2df3f4dc444c4dcde4e5bf2edefdd9a
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 78 +++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 90284f90ee..20d761d058 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -39,6 +39,7 @@ THE SOFTWARE.
 #define CUDA_11040 11040
 #define CUDA_11060 11060
 #define CUDA_12000 12000
+#define CUDA_12020 12020
 #define CUDA_12030 12030
 
 #ifdef __cplusplus
@@ -1513,6 +1514,8 @@ typedef enum cudaExternalSemaphoreHandleType hipExternalSemaphoreHandleType;
 typedef struct cudaExternalSemaphoreHandleDesc hipExternalSemaphoreHandleDesc;
 typedef cudaExternalSemaphore_t hipExternalSemaphore_t;
 typedef struct cudaExternalSemaphoreSignalParams hipExternalSemaphoreSignalParams;
+typedef struct cudaExternalSemaphoreSignalNodeParams hipExternalSemaphoreSignalNodeParams;
+typedef struct cudaExternalSemaphoreWaitNodeParams hipExternalSemaphoreWaitNodeParams;
 typedef struct cudaExternalSemaphoreWaitParams hipExternalSemaphoreWaitParams;
 
 typedef struct cudaGraphicsResource hipGraphicsResource;
@@ -1556,6 +1559,9 @@ typedef cudaHostFn_t hipHostFn_t;
 typedef struct cudaHostNodeParams hipHostNodeParams;
 typedef struct cudaKernelNodeParams hipKernelNodeParams;
 typedef struct cudaMemsetParams hipMemsetParams;
+#if CUDA_VERSION >= CUDA_12020
+typedef struct cudaGraphNodeParams hipGraphNodeParams;
+#endif
 
 #if CUDA_VERSION >= CUDA_11040
 typedef struct cudaMemAllocNodeParams hipMemAllocNodeParams;
@@ -3736,6 +3742,10 @@ inline static hipError_t hipGraphInstantiateWithParams(hipGraphExec_t* pGraphExe
     return hipCUDAErrorTohipError(cudaGraphInstantiateWithParams(pGraphExec, graph,
                                                                  instantiateParams));
 }
+
+inline static hipError_t hipGraphExecGetFlags(hipGraphExec_t graphExec, unsigned long long* flags) {
+    return hipCUDAErrorTohipError(cudaGraphExecGetFlags(graphExec, flags));
+}
 #endif
 
 #if CUDA_VERSION >= CUDA_11040
@@ -4152,6 +4162,74 @@ inline static hipError_t hipGraphHostNodeGetParams(hipGraphNode_t node,
     return hipCUDAErrorTohipError(cudaGraphHostNodeGetParams(node, pNodeParams));
 }
 
+inline static hipError_t hipGraphExecExternalSemaphoresSignalNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+    const hipExternalSemaphoreSignalNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExecExternalSemaphoresSignalNodeSetParams(hGraphExec, hNode, nodeParams));
+}
+
+inline static hipError_t hipGraphExecExternalSemaphoresWaitNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode,
+    const hipExternalSemaphoreWaitNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExecExternalSemaphoresWaitNodeSetParams(hGraphExec, hNode, nodeParams));
+}
+
+inline static hipError_t hipGraphAddExternalSemaphoresSignalNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreSignalNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphAddExternalSemaphoresSignalNode(
+        pGraphNode, graph, pDependencies, numDependencies, nodeParams));
+}
+
+inline static hipError_t hipGraphAddExternalSemaphoresWaitNode(
+    hipGraphNode_t* pGraphNode, hipGraph_t graph, const hipGraphNode_t* pDependencies,
+    size_t numDependencies, const hipExternalSemaphoreWaitNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphAddExternalSemaphoresWaitNode(
+        pGraphNode, graph, pDependencies, numDependencies, nodeParams));
+}
+
+inline static hipError_t hipGraphExternalSemaphoresSignalNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreSignalNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphExternalSemaphoresSignalNodeSetParams(hNode, nodeParams));
+}
+
+inline static hipError_t hipGraphExternalSemaphoresWaitNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreWaitNodeParams* paramsOut) {
+    return hipCUDAErrorTohipError(cudaGraphExternalSemaphoresWaitNodeGetParams(hNode, paramsOut));
+}
+
+inline static hipError_t hipGraphExternalSemaphoresWaitNodeSetParams(
+    hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExternalSemaphoresWaitNodeSetParams(hNode, nodeParams));
+}
+
+inline static hipError_t hipGraphExternalSemaphoresSignalNodeGetParams(
+    hipGraphNode_t hNode, hipExternalSemaphoreSignalNodeParams* paramsOut) {
+    return hipCUDAErrorTohipError(cudaGraphExternalSemaphoresSignalNodeGetParams(hNode, paramsOut));
+}
+
+#if CUDA_VERSION >= CUDA_12020
+inline static hipError_t hipGraphAddNode(hipGraphNode_t* pGraphNode, hipGraph_t graph,
+                                         const hipGraphNode_t* pDependencies,
+                                         size_t numDependencies, hipGraphNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(
+        cudaGraphAddNode(pGraphNode, graph, pDependencies, numDependencies, nodeParams));
+}
+
+inline static hipError_t hipGraphExecNodeSetParams(hipGraphExec_t graphExec, hipGraphNode_t node,
+                                                   hipGraphNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphExecNodeSetParams(graphExec, node, nodeParams));
+}
+
+inline static hipError_t hipGraphNodeSetParams(hipGraphNode_t node,
+                                               hipGraphNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cudaGraphNodeSetParams(node, nodeParams));
+}
+#endif
+
 #if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphMemcpyNodeSetParams1D(hipGraphNode_t node, void* dst,
                                                        const void* src, size_t count,

From f50835c1fd031c39b068e31241b9bffc5368e5e4 Mon Sep 17 00:00:00 2001
From: Branislav Brzak <branislav.brzak@amd.com>
Date: Tue, 24 Sep 2024 12:22:51 +0200
Subject: [PATCH 143/177] SWDEV-483315 - Add hipGraphNodeGetDependentNodes_v2

Change-Id: I1ac0fb50df248521dfaae283bf9017acce146279
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 20d761d058..2b1b4c3767 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3956,6 +3956,14 @@ inline static hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node,
     return hipCUDAErrorTohipError(
         cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes));
 }
+inline static hipError_t hipGraphNodeGetDependentNodes_v2(hipGraphNode_t node,
+                                                          hipGraphNode_t* pDependentNodes,
+                                                          hipGraphEdgeData* edgeData,
+                                                          size_t* pNumDependentNodes) {
+    return hipCUDAErrorTohipError(
+        cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes));
+}
+
 
 inline static hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType) {
     return hipCUDAErrorTohipError(cudaGraphNodeGetType(node, pType));

From a74f9e1770ad86653343878039608b060eb02c24 Mon Sep 17 00:00:00 2001
From: Ajay GunaShekar <AJAY.GunaShekar@amd.com>
Date: Fri, 27 Sep 2024 15:53:04 -0400
Subject: [PATCH 144/177] Revert "SWDEV-483315 - Add
 hipGraphNodeGetDependentNodes_v2"

This reverts commit f50835c1fd031c39b068e31241b9bffc5368e5e4.

Linux builds are silently failing; please check the build logs.

Change-Id: I1caeff6e5cf48697a36f7680b1ef31bcab93b6df
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 2b1b4c3767..20d761d058 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3956,14 +3956,6 @@ inline static hipError_t hipGraphNodeGetDependentNodes(hipGraphNode_t node,
     return hipCUDAErrorTohipError(
         cudaGraphNodeGetDependentNodes(node, pDependentNodes, pNumDependentNodes));
 }
-inline static hipError_t hipGraphNodeGetDependentNodes_v2(hipGraphNode_t node,
-                                                          hipGraphNode_t* pDependentNodes,
-                                                          hipGraphEdgeData* edgeData,
-                                                          size_t* pNumDependentNodes) {
-    return hipCUDAErrorTohipError(
-        cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes));
-}
-
 
 inline static hipError_t hipGraphNodeGetType(hipGraphNode_t node, hipGraphNodeType* pType) {
     return hipCUDAErrorTohipError(cudaGraphNodeGetType(node, pType));

From e22853eb4d89460682ea89a091e43b315f957a4e Mon Sep 17 00:00:00 2001
From: Branislav Brzak <branislav.brzak@amd.com>
Date: Mon, 30 Sep 2024 11:36:43 +0000
Subject: [PATCH 145/177] SWDEV-483315 - Add hipGraphNodeGetDependentNodes_v2

Change-Id: I418d0c2a5a77b07603e4e481fd9547cf43b03a20
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 20d761d058..4ebf6edf82 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3711,6 +3711,14 @@ inline static hipError_t hipStreamBeginCaptureToGraph(hipStream_t stream, hipGra
     return hipCUDAErrorTohipError(cudaStreamBeginCaptureToGraph(
         stream, graph, dependencies, dependencyData, numDependencies, mode));
 }
+
+inline static hipError_t hipGraphNodeGetDependentNodes_v2(hipGraphNode_t node,
+                                                          hipGraphNode_t* pDependentNodes,
+                                                          hipGraphEdgeData* edgeData,
+                                                          size_t* pNumDependentNodes) {
+    return hipCUDAErrorTohipError(
+        cudaGraphNodeGetDependentNodes_v2(node, pDependentNodes, edgeData, pNumDependentNodes));
+}
 #endif
 inline static hipError_t hipStreamEndCapture(hipStream_t stream, hipGraph_t* pGraph) {
     return hipCUDAErrorTohipError(cudaStreamEndCapture(stream, pGraph));

From bceb8acbc4a46ccd964a59ef93036c79fa1eff2e Mon Sep 17 00:00:00 2001
From: amd-jmacaran <Joseph.Macaranas@amd.com>
Date: Mon, 25 Nov 2024 00:19:15 -0500
Subject: [PATCH 146/177] SWDEV-458516 - External CI: Support commits on both
 staging and mainline

Change-Id: Ie256402db9c5f306eb1feb05a088fa9a5eab9e5f
---
 .azuredevops/rocm-ci.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.azuredevops/rocm-ci.yml b/.azuredevops/rocm-ci.yml
index 41901e3f01..0c2d620d7b 100644
--- a/.azuredevops/rocm-ci.yml
+++ b/.azuredevops/rocm-ci.yml
@@ -8,12 +8,12 @@ resources:
     type: github
     endpoint: ROCm
     name: ROCm/clr
-    ref: amd-staging
+    ref: $(Build.SourceBranch)
   - repository: hipother_repo
     type: github
     endpoint: ROCm
     name: ROCm/HIP # leverage HIP job that builds both AMD and NV backends
-    ref: amd-staging
+    ref: $(Build.SourceBranch)
 
 variables:
 - group: common

From 66367f3ec628b7fb55fd147cba90cdc7e2587eb8 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Thu, 7 Nov 2024 17:40:52 +0200
Subject: [PATCH 147/177] SWDEV-483312 - Add CUDA driver API interfaces

Change-Id: Ib37cc6085898ae9dc86e509503cd52b61c1a5356
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 110 ++++++++++++++++++
 1 file changed, 110 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 4ebf6edf82..cb4dbb9709 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -31,6 +31,7 @@ THE SOFTWARE.
 #include <stdio.h>
 
 #define CUDA_9000 9000
+#define CUDA_10000 10000
 #define CUDA_10010 10010
 #define CUDA_10020 10020
 #define CUDA_11010 11010
@@ -612,6 +613,56 @@ static inline void hipMemcpy3DTocudaMemcpy3D(CUDA_MEMCPY3D* a, const HIP_MEMCPY3
     a->Depth = (size_t)p->Depth;
 }
 
+static inline void cudaMemcpy3DToHipMemcpy3D(HIP_MEMCPY3D* a, const CUDA_MEMCPY3D* p) {
+    a->srcXInBytes = (unsigned int)p->srcXInBytes;
+    a->srcY = (unsigned int)p->srcY;
+    a->srcZ = (unsigned int)p->srcZ;
+    a->srcLOD = (unsigned int)p->srcLOD;
+    switch (p->srcMemoryType) {
+        case CU_MEMORYTYPE_HOST:
+            a->srcMemoryType = hipMemoryTypeHost;
+            break;
+        case CU_MEMORYTYPE_DEVICE:
+            a->srcMemoryType = hipMemoryTypeDevice;
+            break;
+        case CU_MEMORYTYPE_ARRAY:
+            a->srcMemoryType = hipMemoryTypeArray;
+            break;
+        default:
+            a->srcMemoryType = hipMemoryTypeUnified;
+    }
+    a->srcHost = p->srcHost;
+    a->srcDevice =(hipDeviceptr_t)p->srcDevice;
+    a->srcArray = (hipArray_t)p->srcArray;
+    a->srcPitch = (unsigned int)p->srcPitch;
+    a->srcHeight = (unsigned int)p->srcHeight;
+    a->dstXInBytes = (unsigned int)p->dstXInBytes;
+    a->dstY = (unsigned int)p->dstY;
+    a->dstZ = (unsigned int)p->dstZ;
+    a->dstLOD = (unsigned int)p->dstLOD;
+    switch (p->dstMemoryType) {
+        case CU_MEMORYTYPE_HOST:
+            a->dstMemoryType = hipMemoryTypeHost;
+            break;
+        case CU_MEMORYTYPE_DEVICE:
+            a->dstMemoryType = hipMemoryTypeDevice;
+            break;
+        case CU_MEMORYTYPE_ARRAY:
+            a->dstMemoryType = hipMemoryTypeArray;
+            break;
+        default:
+            a->dstMemoryType = hipMemoryTypeUnified;
+    }
+    a->dstHost = p->dstHost;
+    a->dstDevice = (hipDeviceptr_t)p->dstDevice;
+    a->dstArray = (hipArray_t)p->dstArray;
+    a->dstPitch = (unsigned int)p->dstPitch;
+    a->dstHeight = (unsigned int)p->dstHeight;
+    a->WidthInBytes = (unsigned int)p->WidthInBytes;
+    a->Height = (unsigned int)p->Height;
+    a->Depth = (unsigned int)p->Depth;
+}
+
 static inline void hipMemcpy2DTocudaMemcpy2D(CUDA_MEMCPY2D* a, const hip_Memcpy2D* p){
     a->srcXInBytes = (size_t)p->srcXInBytes;
     a->srcY = (size_t)p->srcY;
@@ -4343,6 +4394,65 @@ inline static hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, h
                                     numDependencies, (const CUDA_MEMCPY3D*)&cudaCopy, ctx)));
     }
 }
+
+#if CUDA_VERSION >= CUDA_10000
+inline static hipError_t hipDrvGraphMemcpyNodeGetParams(hipGraphNode_t hNode,
+                                                        HIP_MEMCPY3D* nodeParams) {
+  if (nodeParams == nullptr) {
+    return hipCUResultTohipError(cuGraphMemcpyNodeGetParams(hNode, nullptr));
+  } else {
+    CUDA_MEMCPY3D cudaCopy = {0};
+    hipError_t err =
+        hipCUResultTohipError(cuGraphMemcpyNodeGetParams(hNode, (CUDA_MEMCPY3D*)&cudaCopy));
+    cudaMemcpy3DToHipMemcpy3D(nodeParams, &cudaCopy);
+    return err;
+  }
+}
+
+inline static hipError_t hipDrvGraphMemcpyNodeSetParams(hipGraphNode_t hNode,
+                                                        HIP_MEMCPY3D* nodeParams) {
+  if (nodeParams == nullptr) {
+    return hipCUResultTohipError(cuGraphMemcpyNodeSetParams(hNode, nullptr));
+  } else {
+    CUDA_MEMCPY3D cudaCopy = {0};
+    hipMemcpy3DTocudaMemcpy3D(&cudaCopy, nodeParams);
+    return hipCUResultTohipError(cuGraphMemcpyNodeSetParams(hNode, (CUDA_MEMCPY3D*)&cudaCopy));
+  }
+}
+#endif
+
+#if CUDA_VERSION >= CUDA_10020
+inline static hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGraphExec,
+                                                            hipGraphNode_t hNode,
+                                                            const HIP_MEMCPY3D* copyParams,
+                                                            hipCtx_t ctx) {
+  if (copyParams == nullptr) {
+    return hipCUResultTohipError(cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, nullptr, ctx));
+  } else {
+    CUDA_MEMCPY3D cudaCopy = {0};
+    hipMemcpy3DTocudaMemcpy3D(&cudaCopy, copyParams);
+    return hipCUResultTohipError(
+        cuGraphExecMemcpyNodeSetParams(hGraphExec, hNode, (CUDA_MEMCPY3D*)&cudaCopy, ctx));
+  }
+}
+
+inline static hipError_t hipDrvGraphExecMemsetNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const HIP_MEMSET_NODE_PARAMS* memsetParams,
+    hipCtx_t ctx) {
+  return hipCUResultTohipError(
+      cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams, ctx));
+}
+#endif
+
+#if CUDA_VERSION >= CUDA_11040
+inline static hipError_t hipDrvGraphAddMemFreeNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                                   const hipGraphNode_t* dependencies,
+                                                   size_t numDependencies, hipDeviceptr_t dptr) {
+  return hipCUResultTohipError(
+      cuGraphAddMemFreeNode(phGraphNode, hGraph, dependencies, numDependencies, dptr));
+}
+#endif
+
 #endif
 #if CUDA_VERSION >= CUDA_11010
 inline static hipError_t hipGraphUpload(hipGraphExec_t graphExec, hipStream_t stream) {

From f65d41442fef66172828efedacafa33c3d718234 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Thu, 21 Nov 2024 18:50:58 +0200
Subject: [PATCH 148/177] SWDEV-499927 - Fix hipMemAllocationProp

 - CUDA's CUmemAllocationProp names this member requestedHandleTypes, whereas HIP names it requestedHandleType. Rename the member of the hipnv hipMemAllocationProp struct to match HIP instead of following CUDA's naming convention, since the latter would require a lot of changes across the HIP repositories (clr, hip, hip-tests).
 - Fix hipMemGetAllocationPropertiesFromHandle so that it populates the out parameter with the properties returned by CUDA.

Change-Id: I8417a2e73b0ee9fa5ca0d36546e75cfce82cbf3e
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 26 ++++++++++++++++---
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index cb4dbb9709..5f3babb660 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1729,7 +1729,7 @@ typedef struct hipMemAllocationProp {
     /** Memory allocation type */
     hipMemAllocationType type;
     /** Requested handle type */
-    hipMemAllocationHandleType requestedHandleTypes;
+    hipMemAllocationHandleType requestedHandleType;
     /** Location of allocation */
     hipMemLocation location;
     /**
@@ -2690,7 +2690,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
 inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(const hipMemAllocationProp* prop) {
     CUmemAllocationProp cuProp;
     cuProp.type = (CUmemAllocationType)prop->type;
-    cuProp.requestedHandleTypes = (CUmemAllocationHandleType)prop->requestedHandleTypes;
+    cuProp.requestedHandleTypes = (CUmemAllocationHandleType)prop->requestedHandleType;
     cuProp.location.type = (CUmemLocationType)prop->location.type;
     cuProp.location.id = prop->location.id;
     cuProp.win32HandleMetaData = prop->win32HandleMetaData;
@@ -2703,6 +2703,22 @@ inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(cons
     cuProp.allocFlags.reserved[3] = prop->allocFlags.reserved[3];
     return cuProp;
 }
+inline static hipMemAllocationProp CUmemAllocationPropToHipMemAllocationProp(const CUmemAllocationProp* prop) {
+  hipMemAllocationProp hipProp;
+  hipProp.type = (hipMemAllocationType)prop->type;
+  hipProp.requestedHandleType = (hipMemAllocationHandleType)prop->requestedHandleTypes;
+  hipProp.location.type = (hipMemLocationType)prop->location.type;
+  hipProp.location.id = prop->location.id;
+  hipProp.win32HandleMetaData = prop->win32HandleMetaData;
+  hipProp.allocFlags.compressionType = prop->allocFlags.compressionType;
+  hipProp.allocFlags.gpuDirectRDMACapable = prop->allocFlags.gpuDirectRDMACapable;
+  hipProp.allocFlags.usage = prop->allocFlags.usage;
+  hipProp.allocFlags.reserved[0] = prop->allocFlags.reserved[0];
+  hipProp.allocFlags.reserved[1] = prop->allocFlags.reserved[1];
+  hipProp.allocFlags.reserved[2] = prop->allocFlags.reserved[2];
+  hipProp.allocFlags.reserved[3] = prop->allocFlags.reserved[3];
+  return hipProp;
+}
 inline static CUmemLocation hipMemLocationToCUmemLocation(const hipMemLocation* loc) {
     CUmemLocation cuLoc;
     cuLoc.id = loc->id;
@@ -2756,8 +2772,10 @@ inline static hipError_t hipMemGetAccess(unsigned long long* flags,
 }
 inline static hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
                                                                  hipMemGenericAllocationHandle_t handle) {
-    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
-    return hipCUResultTohipError(cuMemGetAllocationPropertiesFromHandle(&cuProp, handle));
+    CUmemAllocationProp cuProp;
+    auto err = cuMemGetAllocationPropertiesFromHandle(&cuProp, handle);
+    *prop = CUmemAllocationPropToHipMemAllocationProp(&cuProp);
+    return hipCUResultTohipError(err);
 }
 inline static hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle,
                                                          void* osHandle,

From 0b82c50300ff875ddd0b14e328449e616dc067a2 Mon Sep 17 00:00:00 2001
From: Jimbo Xie <jiabaxie@amd.com>
Date: Wed, 6 Nov 2024 01:28:54 -0500
Subject: [PATCH 149/177] SWDEV-477219 - hipEventRecordWithFlags hipother

Change-Id: I17313697f24ab095f134da0873148962114df5fc
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 5f3babb660..61cb1703fe 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -366,6 +366,9 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define hipEventReleaseToDevice 0 /* no-op on CUDA platform */
 #define hipEventReleaseToSystem 0 /* no-op on CUDA platform */
 
+//! Flags that can be used with hipEventRecordWithFlags.
+#define hipEventRecordDefault cudaEventRecordDefault
+#define hipEventRecordExternal cudaEventRecordExternal
 
 #define hipHostMallocDefault cudaHostAllocDefault
 #define hipHostMallocPortable cudaHostAllocPortable
@@ -2926,6 +2929,11 @@ inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __d
     return hipCUDAErrorTohipError(cudaEventRecord(event, stream));
 }
 
+inline static hipError_t hipEventRecordWithFlags(hipEvent_t event, hipStream_t stream __dparm(0),
+                                                 unsigned int flags __dparm(0))
+    return hipCUDAErrorTohipError(cudaEventRecordWithFlags(event, stream, flags));
+}
+
 inline static hipError_t hipEventSynchronize(hipEvent_t event) {
     return hipCUDAErrorTohipError(cudaEventSynchronize(event));
 }

From 4fd3f70f51718cd464067f647434fb5aada99e74 Mon Sep 17 00:00:00 2001
From: Sourabh Betigeri <sourabh.betigeri@amd.com>
Date: Wed, 11 Sep 2024 19:04:02 -0700
Subject: [PATCH 150/177] SWDEV-484578 SWDEV-484575 SWDEV-484573 SWDEV-483324
 SWDEV-483323 - Adds nvidia mappings

For hipStreamBatchMemOp, hipGraphAddBatchMemOpNode, hipGraphBatchMemOpNodeSetParams,
hipGraphBatchMemOpNodeGetParams, hipGraphExecBatchMemOpNodeSetParams APIs

Change-Id: I54a42fe070ee4a9e0ae41718b79f96aed883db5e
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 178 ++++++++++++++++++
 1 file changed, 178 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 61cb1703fe..237642174f 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -519,6 +519,146 @@ static inline void hipResourceDesTocudaResourceDes(CUDA_RESOURCE_DESC* a, const
     a->flags = p->flags;
 }
 
+
+/** Operations for hipStreamBatchMemOp*/
+typedef enum hipStreamBatchMemOpType {
+    hipStreamMemOpWaitValue32 = 0x1,
+    hipStreamMemOpWriteValue32 = 0x2,
+    hipStreamMemOpWaitValue64 = 0x4,
+    hipStreamMemOpWriteValue64 = 0x5,
+    hipStreamMemOpBarrier = 0x6,            ///< Currently not supported
+    hipStreamMemOpFlushRemoteWrites = 0x3   ///< Currently not supported
+} hipStreamBatchMemOpType;
+
+
+
+inline static CUstreamBatchMemOpType hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
+                                     hipStreamBatchMemOpType memOpType) {
+    switch (memOpType) {
+        case hipStreamMemOpWaitValue32:
+            return CU_STREAM_MEM_OP_WAIT_VALUE_32;
+        case hipStreamMemOpWriteValue32:
+            return CU_STREAM_MEM_OP_WRITE_VALUE_32;
+        case hipStreamMemOpWaitValue64:
+            return CU_STREAM_MEM_OP_WAIT_VALUE_64;
+        case hipStreamMemOpWriteValue64:
+            return CU_STREAM_MEM_OP_WRITE_VALUE_64;
+        case hipStreamMemOpBarrier:
+            return CU_STREAM_MEM_OP_BARRIER;
+        case hipStreamMemOpFlushRemoteWrites:
+            return CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+        default:
+            return CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    }
+}
+
+typedef union hipStreamBatchMemOpParams_union {
+    hipStreamBatchMemOpType operation;
+    struct hipStreamMemOpWaitValueParams_t{
+        hipStreamBatchMemOpType operation;
+        hipDeviceptr_t address;
+    union {
+        uint32_t value;
+        uint64_t value64;
+    };
+    unsigned int flags;
+    hipDeviceptr_t alias;   ///< Not valid for AMD backend. Initial value is unimportant
+  } waitValue;
+  struct hipStreamMemOpWriteValueParams_t{
+      hipStreamBatchMemOpType operation;
+      hipDeviceptr_t address;
+      union {
+          uint32_t value;
+          uint64_t value64;
+      };
+      unsigned int flags;
+      hipDeviceptr_t alias;   ///< Not valid for AMD backend. Initial value is unimportant
+    } writeValue;
+    struct hipStreamMemOpFlushRemoteWritesParams_t{
+        hipStreamBatchMemOpType operation;
+        unsigned int flags;
+    } flushRemoteWrites;    ///< Currently not supported on AMD
+    struct hipStreamMemOpMemoryBarrierParams_t{
+        hipStreamBatchMemOpType operation;
+        unsigned int flags;
+    } memoryBarrier;        ///< Currently not supported on AMD
+    uint64_t pad[6];
+} hipStreamBatchMemOpParams;
+// hipStreamBatchMemOpType
+
+typedef struct hipBatchMemOpNodeParams {
+    hipCtx_t ctx;
+    unsigned int count;
+    hipStreamBatchMemOpParams* paramArray;
+    unsigned int flags;
+} hipBatchMemOpNodeParams;
+
+#define hipStreamBatchMemOpType CUstreamBatchMemOpType
+
+static inline void hipBatchMemOpParamsTocudaBatchMemOpParams(CUstreamBatchMemOpParams* a,
+                                                             const hipStreamBatchMemOpParams* p,
+                                                             unsigned int count) {
+    for (unsigned int i = 0; i < count; i++) {
+        switch (p[i].operation) {
+            case hipStreamMemOpWaitValue32:
+                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+            break;
+            case hipStreamMemOpWriteValue32:
+                a[i].operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+            break;
+            case hipStreamMemOpWaitValue64:
+                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_64;
+            break;
+            case hipStreamMemOpWriteValue64:
+                a[i].operation = CU_STREAM_MEM_OP_WRITE_VALUE_64;
+            break;
+            case hipStreamMemOpBarrier:
+                a[i].operation = CU_STREAM_MEM_OP_BARRIER;
+            break;
+            case hipStreamMemOpFlushRemoteWrites:
+                a[i].operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+            break;
+            default:
+                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+            break;
+        }
+    a[i].waitValue.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
+                                                           p[i].waitValue.operation);
+    a[i].waitValue.address = p[i].waitValue.address;
+    a[i].waitValue.value = static_cast<cuuint32_t>(p[i].waitValue.value);
+    a[i].waitValue.value64 = static_cast<cuuint64_t>(p[i].waitValue.value64);
+    a[i].waitValue.flags = p[i].waitValue.flags;
+    a[i].waitValue.alias = (CUdeviceptr)p[i].waitValue.alias;
+
+    a[i].writeValue.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
+                                                            p[i].waitValue.operation);
+    a[i].writeValue.address = p[i].writeValue.address;
+    a[i].writeValue.value = static_cast<cuuint32_t>(p[i].writeValue.value);
+    a[i].writeValue.value64 = static_cast<cuuint64_t>(p[i].writeValue.value64);
+    a[i].writeValue.flags = p[i].writeValue.flags;
+    a[i].writeValue.alias = (CUdeviceptr)p[i].writeValue.alias;
+
+    a[i].flushRemoteWrites.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
+                                                           p[i].flushRemoteWrites.operation);
+    a[i].flushRemoteWrites.flags = p[i].flushRemoteWrites.flags;
+
+    a[i].memoryBarrier.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
+                                                           p[i].memoryBarrier.operation);
+    a[i].memoryBarrier.flags = p[i].memoryBarrier.flags;
+  }
+}
+
+static inline void hipBatchMemOpNodeParamsTocudaBatchMemOpNodeParams(
+                   CUDA_BATCH_MEM_OP_NODE_PARAMS* a,
+                   const hipBatchMemOpNodeParams* p) {
+    CUstreamBatchMemOpParams cuParamArray[p->count];
+    hipBatchMemOpParamsTocudaBatchMemOpParams(cuParamArray, p->paramArray, p->count);
+    a->ctx = (CUcontext)p->ctx;
+    a->count = p->count;
+    a->paramArray = cuParamArray;
+    a->flags = p->flags;
+}
+
 typedef struct hip_Memcpy2D {
     size_t srcXInBytes;
     size_t srcY;
@@ -4016,6 +4156,44 @@ inline static hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int
                                                      static_cast<cuuint64_t>(value), flags));
 }
 
+inline static hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
+                                             hipStreamBatchMemOpParams* paramArray,
+                                             unsigned int flags) {
+    CUstreamBatchMemOpParams cuParamArray[count];
+    hipBatchMemOpParamsTocudaBatchMemOpParams(cuParamArray, paramArray, count);
+    return hipCUResultTohipError(cuStreamBatchMemOp(stream, count, cuParamArray, flags));
+}
+
+inline static hipError_t hipGraphAddBatchMemOpNode(hipGraphNode_t *phGraphNode, hipGraph_t hGraph,
+                                                   const hipGraphNode_t *dependencies,
+                                                   size_t numDependencies,
+                                                   const hipBatchMemOpNodeParams* nodeParams) {
+    CUDA_BATCH_MEM_OP_NODE_PARAMS cuBatchMemOpNodeParams;
+    hipBatchMemOpNodeParamsTocudaBatchMemOpNodeParams(&cuBatchMemOpNodeParams, nodeParams);
+    return hipCUDAErrorTohipError(cuGraphAddBatchMemOpNode(phGraphNode, hGraph, dependencies,
+                                  numDependencies,
+                                  (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)&cuBatchMemOpNodeParams));
+}
+
+inline static hipError_t hipGraphBatchMemOpNodeGetParams(hipGraphNode_t hNode,
+                                                         hipBatchMemOpNodeParams* nodeParams_out) {
+    return hipCUDAErrorTohipError(cuGraphBatchMemOpNodeGetParams(hNode, nodeParams_out));
+}
+
+inline static hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
+                                                         hipBatchMemOpNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cuGraphBatchMemOpNodeSetParams (hNode,
+                                            (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)nodeParams));
+}
+
+inline static hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec,
+                                                    hipGraphNode_t hNode,
+                                                    const hipBatchMemOpNodeParams* nodeParams) {
+    return hipCUDAErrorTohipError(cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode,
+                                                (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)nodeParams));
+}
+
+
 inline static hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from,
                                                     const hipGraphNode_t* to,
                                                     size_t numDependencies) {

From 86fbfba69e07382248faddb7b6cb466fe14248e5 Mon Sep 17 00:00:00 2001
From: Jaydeep Patel <jaydeepkumar.patel@amd.com>
Date: Tue, 10 Dec 2024 15:28:33 +0000
Subject: [PATCH 151/177] SWDEV-477219 - Add missing curly brace.

Change-Id: I58268d4c2f3cdc0d647deea51b250117e77d82ae
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 237642174f..d58a5fcafd 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -3070,7 +3070,7 @@ inline static hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream __d
 }
 
 inline static hipError_t hipEventRecordWithFlags(hipEvent_t event, hipStream_t stream __dparm(0),
-                                                 unsigned int flags __dparm(0))
+                                                 unsigned int flags __dparm(0)) {
     return hipCUDAErrorTohipError(cudaEventRecordWithFlags(event, stream, flags));
 }
 

From 1d96ab69ee4ee0aa9d9017b8f5c20d3b7cdd12aa Mon Sep 17 00:00:00 2001
From: Sourabh Betigeri <sourabh.betigeri@amd.com>
Date: Wed, 11 Dec 2024 13:28:59 +0000
Subject: [PATCH 152/177] SWDEV-484578 SWDEV-484575 SWDEV-484573 SWDEV-483324
 SWDEV-483323 - Fixes issues in nvidia mappings for batch mem ops

Change-Id: I6202ea5691b8256e004650d2689c2826a53d8113
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 229 +++++++++---------
 1 file changed, 114 insertions(+), 115 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d58a5fcafd..d5347684e8 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -519,69 +519,66 @@ static inline void hipResourceDesTocudaResourceDes(CUDA_RESOURCE_DESC* a, const
     a->flags = p->flags;
 }
 
-
 /** Operations for hipStreamBatchMemOp*/
 typedef enum hipStreamBatchMemOpType {
     hipStreamMemOpWaitValue32 = 0x1,
     hipStreamMemOpWriteValue32 = 0x2,
     hipStreamMemOpWaitValue64 = 0x4,
     hipStreamMemOpWriteValue64 = 0x5,
-    hipStreamMemOpBarrier = 0x6,            ///< Currently not supported
-    hipStreamMemOpFlushRemoteWrites = 0x3   ///< Currently not supported
+    hipStreamMemOpBarrier = 0x6,          ///< Currently not supported
+    hipStreamMemOpFlushRemoteWrites = 0x3 ///< Currently not supported
 } hipStreamBatchMemOpType;
 
-
-
 inline static CUstreamBatchMemOpType hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
-                                     hipStreamBatchMemOpType memOpType) {
+    hipStreamBatchMemOpType memOpType) {
     switch (memOpType) {
-        case hipStreamMemOpWaitValue32:
-            return CU_STREAM_MEM_OP_WAIT_VALUE_32;
-        case hipStreamMemOpWriteValue32:
-            return CU_STREAM_MEM_OP_WRITE_VALUE_32;
-        case hipStreamMemOpWaitValue64:
-            return CU_STREAM_MEM_OP_WAIT_VALUE_64;
-        case hipStreamMemOpWriteValue64:
-            return CU_STREAM_MEM_OP_WRITE_VALUE_64;
-        case hipStreamMemOpBarrier:
-            return CU_STREAM_MEM_OP_BARRIER;
-        case hipStreamMemOpFlushRemoteWrites:
-            return CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
-        default:
-            return CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    case hipStreamMemOpWaitValue32:
+        return CU_STREAM_MEM_OP_WAIT_VALUE_32;
+    case hipStreamMemOpWriteValue32:
+        return CU_STREAM_MEM_OP_WRITE_VALUE_32;
+    case hipStreamMemOpWaitValue64:
+        return CU_STREAM_MEM_OP_WAIT_VALUE_64;
+    case hipStreamMemOpWriteValue64:
+        return CU_STREAM_MEM_OP_WRITE_VALUE_64;
+    case hipStreamMemOpBarrier:
+        return CU_STREAM_MEM_OP_BARRIER;
+    case hipStreamMemOpFlushRemoteWrites:
+        return CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+    default:
+        return CU_STREAM_MEM_OP_WAIT_VALUE_32;
     }
 }
 
 typedef union hipStreamBatchMemOpParams_union {
     hipStreamBatchMemOpType operation;
-    struct hipStreamMemOpWaitValueParams_t{
+    struct hipStreamMemOpWaitValueParams_t {
         hipStreamBatchMemOpType operation;
         hipDeviceptr_t address;
-    union {
-        uint32_t value;
-        uint64_t value64;
-    };
-    unsigned int flags;
-    hipDeviceptr_t alias;   ///< Not valid for AMD backend. Initial value is unimportant
-  } waitValue;
-  struct hipStreamMemOpWriteValueParams_t{
-      hipStreamBatchMemOpType operation;
-      hipDeviceptr_t address;
-      union {
-          uint32_t value;
-          uint64_t value64;
-      };
-      unsigned int flags;
-      hipDeviceptr_t alias;   ///< Not valid for AMD backend. Initial value is unimportant
+        union {
+            uint32_t value;
+            uint64_t value64;
+        };
+        unsigned int flags;
+        hipDeviceptr_t alias; ///< Not valid for AMD backend. Initial value is unimportant
+    } waitValue;
+    struct hipStreamMemOpWriteValueParams_t {
+        hipStreamBatchMemOpType operation;
+        hipDeviceptr_t address;
+        union {
+            uint32_t value;
+            uint64_t value64;
+        };
+        unsigned int flags;
+        hipDeviceptr_t alias; ///< Not valid for AMD backend. Initial value is unimportant
     } writeValue;
-    struct hipStreamMemOpFlushRemoteWritesParams_t{
+    struct hipStreamMemOpFlushRemoteWritesParams_t {
         hipStreamBatchMemOpType operation;
         unsigned int flags;
-    } flushRemoteWrites;    ///< Currently not supported on AMD
-    struct hipStreamMemOpMemoryBarrierParams_t{
+    } flushRemoteWrites; ///< Currently not supported on AMD
+    struct hipStreamMemOpMemoryBarrierParams_t {
         hipStreamBatchMemOpType operation;
         unsigned int flags;
-    } memoryBarrier;        ///< Currently not supported on AMD
+    } memoryBarrier; ///< Currently not supported on AMD
     uint64_t pad[6];
 } hipStreamBatchMemOpParams;
 // hipStreamBatchMemOpType
@@ -589,7 +586,7 @@ typedef union hipStreamBatchMemOpParams_union {
 typedef struct hipBatchMemOpNodeParams {
     hipCtx_t ctx;
     unsigned int count;
-    hipStreamBatchMemOpParams* paramArray;
+    hipStreamBatchMemOpParams *paramArray;
     unsigned int flags;
 } hipBatchMemOpNodeParams;
 
@@ -599,64 +596,43 @@ static inline void hipBatchMemOpParamsTocudaBatchMemOpParams(CUstreamBatchMemOpP
                                                              const hipStreamBatchMemOpParams* p,
                                                              unsigned int count) {
     for (unsigned int i = 0; i < count; i++) {
-        switch (p[i].operation) {
-            case hipStreamMemOpWaitValue32:
-                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
-            break;
-            case hipStreamMemOpWriteValue32:
-                a[i].operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
-            break;
-            case hipStreamMemOpWaitValue64:
-                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_64;
-            break;
-            case hipStreamMemOpWriteValue64:
-                a[i].operation = CU_STREAM_MEM_OP_WRITE_VALUE_64;
-            break;
-            case hipStreamMemOpBarrier:
-                a[i].operation = CU_STREAM_MEM_OP_BARRIER;
-            break;
-            case hipStreamMemOpFlushRemoteWrites:
-                a[i].operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
-            break;
-            default:
-                a[i].operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
-            break;
+        if (p[i].waitValue.operation == hipStreamMemOpWaitValue32) {
+            a[i].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_32;
+            a[i].waitValue.address = p[i].waitValue.address;
+            a[i].waitValue.value = (cuuint32_t)(p[i].waitValue.value);
+            a[i].waitValue.flags = p[i].waitValue.flags;
+            a[i].waitValue.alias = (CUdeviceptr)(p[i].waitValue.alias);
         }
-    a[i].waitValue.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
-                                                           p[i].waitValue.operation);
-    a[i].waitValue.address = p[i].waitValue.address;
-    a[i].waitValue.value = static_cast<cuuint32_t>(p[i].waitValue.value);
-    a[i].waitValue.value64 = static_cast<cuuint64_t>(p[i].waitValue.value64);
-    a[i].waitValue.flags = p[i].waitValue.flags;
-    a[i].waitValue.alias = (CUdeviceptr)p[i].waitValue.alias;
-
-    a[i].writeValue.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
-                                                            p[i].waitValue.operation);
-    a[i].writeValue.address = p[i].writeValue.address;
-    a[i].writeValue.value = static_cast<cuuint32_t>(p[i].writeValue.value);
-    a[i].writeValue.value64 = static_cast<cuuint64_t>(p[i].writeValue.value64);
-    a[i].writeValue.flags = p[i].writeValue.flags;
-    a[i].writeValue.alias = (CUdeviceptr)p[i].writeValue.alias;
-
-    a[i].flushRemoteWrites.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
-                                                           p[i].flushRemoteWrites.operation);
-    a[i].flushRemoteWrites.flags = p[i].flushRemoteWrites.flags;
-
-    a[i].memoryBarrier.operation = hipStreamBatchMemOpType_enumToCUstreamBatchMemOpType(
-                                                           p[i].memoryBarrier.operation);
-    a[i].memoryBarrier.flags = p[i].memoryBarrier.flags;
-  }
-}
-
-static inline void hipBatchMemOpNodeParamsTocudaBatchMemOpNodeParams(
-                   CUDA_BATCH_MEM_OP_NODE_PARAMS* a,
-                   const hipBatchMemOpNodeParams* p) {
-    CUstreamBatchMemOpParams cuParamArray[p->count];
-    hipBatchMemOpParamsTocudaBatchMemOpParams(cuParamArray, p->paramArray, p->count);
-    a->ctx = (CUcontext)p->ctx;
-    a->count = p->count;
-    a->paramArray = cuParamArray;
-    a->flags = p->flags;
+        else if (p[i].writeValue.operation == hipStreamMemOpWriteValue32) {
+            a[i].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_32;
+            a[i].writeValue.address = p[i].writeValue.address;
+            a[i].writeValue.value = (cuuint32_t)(p[i].writeValue.value);
+            a[i].writeValue.flags = p[i].writeValue.flags;
+            a[i].writeValue.alias = (CUdeviceptr)(p[i].writeValue.alias);
+        }
+        else if (p[i].waitValue.operation == hipStreamMemOpWaitValue64) {
+            a[i].waitValue.operation = CU_STREAM_MEM_OP_WAIT_VALUE_64;
+            a[i].waitValue.address = p[i].waitValue.address;
+            a[i].waitValue.value64 = (cuuint64_t)(p[i].waitValue.value64);
+            a[i].waitValue.flags = p[i].waitValue.flags;
+            a[i].waitValue.alias = (CUdeviceptr)(p[i].waitValue.alias);
+        }
+        else if (p[i].writeValue.operation == hipStreamMemOpWriteValue64) {
+            a[i].writeValue.operation = CU_STREAM_MEM_OP_WRITE_VALUE_64;
+            a[i].writeValue.address = p[i].writeValue.address;
+            a[i].writeValue.value64 = (cuuint64_t)(p[i].writeValue.value64);
+            a[i].writeValue.flags = p[i].writeValue.flags;
+            a[i].writeValue.alias = (CUdeviceptr)(p[i].writeValue.alias);
+        }
+        else if (p[i].memoryBarrier.operation == hipStreamMemOpBarrier) {
+            a[i].memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;  // assign, not compare
+            a[i].memoryBarrier.flags = p[i].memoryBarrier.flags;
+        }
+        else if (p[i].flushRemoteWrites.operation == hipStreamMemOpFlushRemoteWrites) {
+            a[i].flushRemoteWrites.operation = CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES;
+            a[i].flushRemoteWrites.flags = p[i].flushRemoteWrites.flags;
+        }
+    }
 }
 
 typedef struct hip_Memcpy2D {
@@ -4159,41 +4135,66 @@ inline static hipError_t hipStreamWaitValue64(hipStream_t stream, void* ptr, int
 inline static hipError_t hipStreamBatchMemOp(hipStream_t stream, unsigned int count,
                                              hipStreamBatchMemOpParams* paramArray,
                                              unsigned int flags) {
-    CUstreamBatchMemOpParams cuParamArray[count];
+    CUstreamBatchMemOpParams* cuParamArray = new CUstreamBatchMemOpParams[count];
     hipBatchMemOpParamsTocudaBatchMemOpParams(cuParamArray, paramArray, count);
-    return hipCUResultTohipError(cuStreamBatchMemOp(stream, count, cuParamArray, flags));
+    CUresult res = cuStreamBatchMemOp(stream, count, cuParamArray, flags);
+    delete[] cuParamArray;  // release the temporary conversion buffer (was leaked)
+    return hipCUResultTohipError(res);
 }
 
-inline static hipError_t hipGraphAddBatchMemOpNode(hipGraphNode_t *phGraphNode, hipGraph_t hGraph,
-                                                   const hipGraphNode_t *dependencies,
+inline static hipError_t hipGraphAddBatchMemOpNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
+                                                   const hipGraphNode_t* dependencies,
                                                    size_t numDependencies,
                                                    const hipBatchMemOpNodeParams* nodeParams) {
     CUDA_BATCH_MEM_OP_NODE_PARAMS cuBatchMemOpNodeParams;
-    hipBatchMemOpNodeParamsTocudaBatchMemOpNodeParams(&cuBatchMemOpNodeParams, nodeParams);
-    return hipCUDAErrorTohipError(cuGraphAddBatchMemOpNode(phGraphNode, hGraph, dependencies,
-                                  numDependencies,
-                                  (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)&cuBatchMemOpNodeParams));
+    CUstreamBatchMemOpParams* cuParamArray = new CUstreamBatchMemOpParams[nodeParams->count];
+    hipBatchMemOpParamsTocudaBatchMemOpParams(
+        cuParamArray, nodeParams->paramArray, nodeParams->count);
+    cuBatchMemOpNodeParams.ctx = (CUcontext)nodeParams->ctx;
+    cuBatchMemOpNodeParams.count = nodeParams->count;
+    cuBatchMemOpNodeParams.paramArray = cuParamArray;
+    cuBatchMemOpNodeParams.flags = nodeParams->flags;
+    CUresult res = cuGraphAddBatchMemOpNode(phGraphNode, hGraph, dependencies, numDependencies,
+                                            &cuBatchMemOpNodeParams);
+    delete[] cuParamArray;  // must run before returning; CUDA docs: paramArray is copied
+    return hipCUResultTohipError(res);
 }
 
 inline static hipError_t hipGraphBatchMemOpNodeGetParams(hipGraphNode_t hNode,
                                                          hipBatchMemOpNodeParams* nodeParams_out) {
-    return hipCUDAErrorTohipError(cuGraphBatchMemOpNodeGetParams(hNode, nodeParams_out));
+    return hipCUResultTohipError(cuGraphBatchMemOpNodeGetParams(
+        hNode, (CUDA_BATCH_MEM_OP_NODE_PARAMS *)nodeParams_out));
 }
 
 inline static hipError_t hipGraphBatchMemOpNodeSetParams(hipGraphNode_t hNode,
                                                          hipBatchMemOpNodeParams* nodeParams) {
-    return hipCUDAErrorTohipError(cuGraphBatchMemOpNodeSetParams (hNode,
-                                            (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)nodeParams));
+    CUstreamBatchMemOpParams* cuOps = new CUstreamBatchMemOpParams[nodeParams->count];
+    hipBatchMemOpParamsTocudaBatchMemOpParams(cuOps, nodeParams->paramArray, nodeParams->count);
+    CUDA_BATCH_MEM_OP_NODE_PARAMS cuBatchMemOpNodeParams;
+    cuBatchMemOpNodeParams.ctx = (CUcontext)nodeParams->ctx;
+    cuBatchMemOpNodeParams.count = nodeParams->count;
+    cuBatchMemOpNodeParams.paramArray = cuOps;
+    cuBatchMemOpNodeParams.flags = nodeParams->flags;
+    CUresult res = cuGraphBatchMemOpNodeSetParams(hNode, &cuBatchMemOpNodeParams);
+    delete[] cuOps;  // must run before returning; CUDA docs: paramArray is copied
+    return hipCUResultTohipError(res);
 }
 
-inline static hipError_t hipGraphExecBatchMemOpNodeSetParams(hipGraphExec_t hGraphExec,
-                                                    hipGraphNode_t hNode,
-                                                    const hipBatchMemOpNodeParams* nodeParams) {
-    return hipCUDAErrorTohipError(cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode,
-                                                (const CUDA_BATCH_MEM_OP_NODE_PARAMS*)nodeParams));
+inline static hipError_t hipGraphExecBatchMemOpNodeSetParams(
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipBatchMemOpNodeParams* nodeParams) {
+    CUstreamBatchMemOpParams* cuParamArray = new CUstreamBatchMemOpParams[nodeParams->count];
+    hipBatchMemOpParamsTocudaBatchMemOpParams(
+        cuParamArray, nodeParams->paramArray, nodeParams->count);
+    CUDA_BATCH_MEM_OP_NODE_PARAMS cuBatchMemOpNodeParams;
+    cuBatchMemOpNodeParams.ctx = (CUcontext)nodeParams->ctx;
+    cuBatchMemOpNodeParams.count = nodeParams->count;
+    cuBatchMemOpNodeParams.paramArray = cuParamArray;
+    cuBatchMemOpNodeParams.flags = nodeParams->flags;
+    CUresult res = cuGraphExecBatchMemOpNodeSetParams(hGraphExec, hNode, &cuBatchMemOpNodeParams);
+    delete[] cuParamArray;  // must run before returning; CUDA docs: paramArray is copied
+    return hipCUResultTohipError(res);
 }
 
-
 inline static hipError_t hipGraphRemoveDependencies(hipGraph_t graph, const hipGraphNode_t* from,
                                                     const hipGraphNode_t* to,
                                                     size_t numDependencies) {

From d38eb853a0efdf7404f463b75af7d31fddf45960 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Fri, 13 Dec 2024 13:40:42 +0200
Subject: [PATCH 153/177] SWDEV-499927 - Added nullptr checks to prevent
 segfaults

Change-Id: I6c721340d195f803a48187009d714c94587f098d
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 27 ++++++++++++-------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d5347684e8..769f4f9d56 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2883,18 +2883,25 @@ inline static hipError_t hipMemExportToShareableHandle(void* shareableHandle,
                                                        unsigned long long flags) {
     return hipCUResultTohipError(cuMemExportToShareableHandle(shareableHandle, handle, (CUmemAllocationHandleType)handleType, flags));
 }
-inline static hipError_t hipMemGetAccess(unsigned long long* flags,
-                                         const hipMemLocation* location,
+inline static hipError_t hipMemGetAccess(unsigned long long* flags, const hipMemLocation* location,
                                          hipDeviceptr_t ptr) {
-    CUmemLocation loc = hipMemLocationToCUmemLocation(location);
-    return hipCUResultTohipError(cuMemGetAccess(flags, &loc, ptr));
+    if (location == NULL) {
+        return hipCUResultTohipError(cuMemGetAccess(flags, NULL, ptr));
+    } else {
+        CUmemLocation loc = hipMemLocationToCUmemLocation(location);
+        return hipCUResultTohipError(cuMemGetAccess(flags, &loc, ptr));
+    }
 }
-inline static hipError_t hipMemGetAllocationPropertiesFromHandle(hipMemAllocationProp* prop,
-                                                                 hipMemGenericAllocationHandle_t handle) {
-    CUmemAllocationProp cuProp;
-    auto err = cuMemGetAllocationPropertiesFromHandle(&cuProp, handle);
-    *prop = CUmemAllocationPropToHipMemAllocationProp(&cuProp);
-    return hipCUResultTohipError(err);
+inline static hipError_t hipMemGetAllocationPropertiesFromHandle(
+    hipMemAllocationProp* prop, hipMemGenericAllocationHandle_t handle) {
+    if (prop == NULL) {
+        return hipCUResultTohipError(cuMemGetAllocationPropertiesFromHandle(NULL, handle));
+    } else {
+        CUmemAllocationProp cuProp = {};  // zeroed so *prop is deterministic if the call fails
+        auto result = cuMemGetAllocationPropertiesFromHandle(&cuProp, handle);
+        *prop = CUmemAllocationPropToHipMemAllocationProp(&cuProp);
+        return hipCUResultTohipError(result);
+    }
 }
 inline static hipError_t hipMemImportFromShareableHandle(hipMemGenericAllocationHandle_t* handle,
                                                          void* osHandle,

From 011c022e70199af6cf5663e2540345a1cdd25943 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Mon, 23 Dec 2024 22:24:41 +0530
Subject: [PATCH 154/177] SWDEV-499927 - Fix param handling in hipMemAccess
 APIs

Change-Id: If7c168e28fc94137abf33e4083b7af9515f24298
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 23 +++++++++++++------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 769f4f9d56..1169bd5ea1 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2844,11 +2844,14 @@ inline static CUmemLocation hipMemLocationToCUmemLocation(const hipMemLocation*
     cuLoc.type = (CUmemLocationType)loc->type;
     return cuLoc;
 }
-inline static CUmemAccessDesc hipMemAccessDescToCUmemAccessDesc(const hipMemAccessDesc* desc) {
-    CUmemAccessDesc cuDesc;
-    cuDesc.flags = (CUmemAccess_flags)desc->flags;
-    cuDesc.location.id = (desc->location).id;
-    cuDesc.location.type = (CUmemLocationType)((desc->location).type);
+inline static CUmemAccessDesc* hipMemAccessDescToCUmemAccessDesc(const hipMemAccessDesc* desc,
+                                                                size_t count) {
+    CUmemAccessDesc* cuDesc = (CUmemAccessDesc*)malloc(sizeof(CUmemAccessDesc) * count);
+    for (size_t i = 0; cuDesc != NULL && i < count; i++) {  // no writes if malloc failed
+        cuDesc[i].flags = (CUmemAccess_flags)desc[i].flags;
+        cuDesc[i].location.id = (desc[i].location).id;
+        cuDesc[i].location.type = (CUmemLocationType)((desc[i].location).type);
+    }
     return cuDesc;
 }
 inline static hipError_t hipMemGetAllocationGranularity(size_t* granularity,
@@ -2925,8 +2928,14 @@ inline static hipError_t hipMemRetainAllocationHandle(hipMemGenericAllocationHan
 inline static hipError_t hipMemSetAccess(hipDeviceptr_t ptr, size_t size,
                                          const hipMemAccessDesc* desc,
                                          size_t count) {
-    CUmemAccessDesc cuDesc = hipMemAccessDescToCUmemAccessDesc(desc);
-    return hipCUResultTohipError(cuMemSetAccess(ptr, size, &cuDesc, count));
+    if (desc == NULL) {
+      return hipCUResultTohipError(cuMemSetAccess(ptr, size, NULL, count));
+    } else {
+      CUmemAccessDesc* cuDesc = hipMemAccessDescToCUmemAccessDesc(desc, count);
+      auto status = hipCUResultTohipError(cuMemSetAccess(ptr, size, cuDesc, count));
+      free(cuDesc);
+      return status;
+    }
 }
 inline static hipError_t hipMemUnmap(hipDeviceptr_t ptr, size_t size) {
     return hipCUResultTohipError(cuMemUnmap(ptr, size));

From 55c17fb4dcaadfb067e1cce92f9628ed03e69b33 Mon Sep 17 00:00:00 2001
From: victzhan <victzhan@amd.com>
Date: Tue, 26 Nov 2024 15:18:03 -0500
Subject: [PATCH 155/177] SWDEV-477218 - add
 hipDeviceGetTexture1DLinearMaxWidth API

Change-Id: I0532360ef298cb2353ae0182b17f8024e7bb3dae
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 1169bd5ea1..792b965162 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -4530,6 +4530,14 @@ inline static hipError_t hipDeviceGetGraphMemAttribute(int device, hipGraphMemAt
     return hipCUDAErrorTohipError(cudaDeviceGetGraphMemAttribute(device, attr, value));
 }
 
+inline static hipError_t hipDeviceGetTexture1DLinearMaxWidth(size_t* maxWidthInElements,
+                                                             const hipChannelFormatDesc* fmtDesc,
+                                                             int device) {
+    return hipCUDAErrorTohipError(cudaDeviceGetTexture1DLinearMaxWidth(maxWidthInElements,
+                                                                       fmtDesc,
+                                                                       device));
+}
+
 inline static hipError_t hipDeviceSetGraphMemAttribute(int device, hipGraphMemAttributeType attr, void* value) {
     return hipCUDAErrorTohipError(cudaDeviceSetGraphMemAttribute(device, attr, value));
 }

From aea1990fe05f0650775df32b475cd0519deb8dd5 Mon Sep 17 00:00:00 2001
From: Marko Arandjelovic <Marko.Arandjelovic@amd.com>
Date: Thu, 20 Feb 2025 22:24:41 +0530
Subject: [PATCH 156/177] SWDEV-499927 - Add nullchecks to hipMemCreate and
 GetAllocationGranularity

Change-Id: I9bb879ce2c702e660ae5e6372cb27fdb8600566b
---
 .../hip/nvidia_detail/nvidia_hip_runtime_api.h   | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 792b965162..f63e0afba5 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2857,15 +2857,23 @@ inline static CUmemAccessDesc* hipMemAccessDescToCUmemAccessDesc(const hipMemAcc
 inline static hipError_t hipMemGetAllocationGranularity(size_t* granularity,
                                                         const hipMemAllocationProp* prop,
                                                         hipMemAllocationGranularity_flags option) {
-    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
-    return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, &cuProp, option));
+    if (prop == NULL) {
+        return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, NULL, option));
+    } else {
+        CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
+        return hipCUResultTohipError(cuMemGetAllocationGranularity(granularity, &cuProp, option));
+    }
 }
 inline static hipError_t hipMemCreate(hipMemGenericAllocationHandle_t* handle,
                                       size_t size,
                                       const hipMemAllocationProp* prop,
                                       unsigned long long flags) {
-    CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
-    return hipCUResultTohipError(cuMemCreate(handle, size, &cuProp, flags));
+    if (prop == NULL) {
+        return hipCUResultTohipError(cuMemCreate(handle, size, NULL, flags));
+    } else {
+        CUmemAllocationProp cuProp = hipMemAllocationPropToCUmemAllocationProp(prop);
+        return hipCUResultTohipError(cuMemCreate(handle, size, &cuProp, flags));
+    }
 }
 inline static hipError_t hipMemRelease(hipMemGenericAllocationHandle_t handle) {
     return hipCUResultTohipError(cuMemRelease(handle));

From f327d4f809f2f98d3af009c103f3e17b7220a931 Mon Sep 17 00:00:00 2001
From: Julia Jiang <julia.jiang@amd.com>
Date: Mon, 3 Feb 2025 14:58:53 -0500
Subject: [PATCH 157/177] SWDEV - 508961 - Update requestedHandleType in
 hipother repos

Change-Id: I50b91dfb42bd2a4daae80a3353b642e9df03fe46
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index f63e0afba5..575904e1d3 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1848,7 +1848,7 @@ typedef struct hipMemAllocationProp {
     /** Memory allocation type */
     hipMemAllocationType type;
     /** Requested handle type */
-    hipMemAllocationHandleType requestedHandleType;
+    hipMemAllocationHandleType requestedHandleTypes;
     /** Location of allocation */
     hipMemLocation location;
     /**
@@ -2809,7 +2809,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
 inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(const hipMemAllocationProp* prop) {
     CUmemAllocationProp cuProp;
     cuProp.type = (CUmemAllocationType)prop->type;
-    cuProp.requestedHandleTypes = (CUmemAllocationHandleType)prop->requestedHandleType;
+    cuProp.requestedHandleTypes = (CUmemAllocationHandleType)prop->requestedHandleTypes;
     cuProp.location.type = (CUmemLocationType)prop->location.type;
     cuProp.location.id = prop->location.id;
     cuProp.win32HandleMetaData = prop->win32HandleMetaData;
@@ -2825,7 +2825,7 @@ inline static CUmemAllocationProp hipMemAllocationPropToCUmemAllocationProp(cons
 inline static hipMemAllocationProp CUmemAllocationPropToHipMemAllocationProp(const CUmemAllocationProp* prop) {
   hipMemAllocationProp hipProp;
   hipProp.type = (hipMemAllocationType)prop->type;
-  hipProp.requestedHandleType = (hipMemAllocationHandleType)prop->requestedHandleTypes;
+  hipProp.requestedHandleTypes = (hipMemAllocationHandleType)prop->requestedHandleTypes;
   hipProp.location.type = (hipMemLocationType)prop->location.type;
   hipProp.location.id = prop->location.id;
   hipProp.win32HandleMetaData = prop->win32HandleMetaData;

From 3cdab14a23e073fa7134a7808d4d80d13c51164d Mon Sep 17 00:00:00 2001
From: Julia Jiang <julia.jiang@amd.com>
Date: Mon, 3 Feb 2025 14:26:27 -0500
Subject: [PATCH 158/177] SWDEV-509855 - Update hipDeviceAttributePciDomainID
 in hipother

Change-Id: Ib245c90c78a27ea48875aa183c1727a17b62c936
---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 575904e1d3..bf53844646 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -2781,7 +2781,7 @@ inline static hipError_t hipDeviceGetAttribute(int* pi, hipDeviceAttribute_t att
         case hipDeviceAttributeMaxTextureCubemap:
             cdattr = cudaDevAttrMaxTextureCubemapWidth;
             break;
-        case hipDeviceAttributePciDomainID:
+        case hipDeviceAttributePciDomainId:
             cdattr = cudaDevAttrPciDomainId;
             break;
         case hipDeviceAttributePersistingL2CacheMaxSize:

From 95ee23540d62301197400e7d456cd958c9ffb43d Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Wed, 12 Mar 2025 10:03:47 -0700
Subject: [PATCH 159/177] Added rocm-ci-caller

---
 .github/workflows/rocm-ci-caller.yml | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)
 create mode 100644 .github/workflows/rocm-ci-caller.yml

diff --git a/.github/workflows/rocm-ci-caller.yml b/.github/workflows/rocm-ci-caller.yml
new file mode 100644
index 0000000000..c18163316b
--- /dev/null
+++ b/.github/workflows/rocm-ci-caller.yml
@@ -0,0 +1,24 @@
+name: ROCm CI Caller
+on:
+  pull_request:
+    branches: [amd-staging, amd-npi, release/rocm-rel-*, amd-mainline]
+    types: [opened, reopened, synchronize]
+  push:
+    branches: [amd-mainline]
+  workflow_dispatch:
+  issue_comment:
+    types: [created]
+
+jobs:
+  call-workflow:
+    if: ${{ github.event_name != 'issue_comment' || github.event.comment.body == '!verify' }}
+    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
+    secrets: inherit
+    with:
+      input_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      input_pr_num: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 0 }}
+      input_pr_url: ${{ github.event_name == 'pull_request' && github.event.pull_request.html_url || '' }}
+      input_pr_title: ${{ github.event_name == 'pull_request' && github.event.pull_request.title || '' }}
+      repository_name: ${{ github.repository }}
+      base_ref: ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}
+      trigger_event_type: ${{ github.event_name }}

From a9b7bc22a865f81b1dc348853d8e6e3c66335898 Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Wed, 12 Mar 2025 10:11:31 -0700
Subject: [PATCH 160/177] Added KWS check

---
 .github/workflows/kws-caller.yml | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 .github/workflows/kws-caller.yml

diff --git a/.github/workflows/kws-caller.yml b/.github/workflows/kws-caller.yml
new file mode 100644
index 0000000000..ffcbff7b1e
--- /dev/null
+++ b/.github/workflows/kws-caller.yml
@@ -0,0 +1,15 @@
+name: Rocm Validation Suite KWS
+on:
+  push:
+    branches: [amd-staging]
+  pull_request:
+    types: [opened, synchronize, reopened]
+  workflow_dispatch:
+jobs:
+  kws:
+    if: ${{ github.event_name == 'pull_request' }}
+    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/kws.yml@mainline
+    secrets: inherit
+    with:
+      pr_number: ${{github.event.pull_request.number}}
+      base_branch: ${{github.base_ref}}

From a00c8186c03f8f8025c43e7c5fbb999e00ed38a6 Mon Sep 17 00:00:00 2001
From: "Gupta, Maneesh" <Maneesh.Gupta@amd.com>
Date: Thu, 20 Mar 2025 15:47:52 +0530
Subject: [PATCH 161/177] Update CODEOWNERS (#3)

---
 CODEOWNERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CODEOWNERS b/CODEOWNERS
index 917a1d8d9b..754e825e9f 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1 +1,2 @@
-* @chrispaquot @gandryey @saleelk @mangupta @rakesroy
+* @cpaquot_amdeng @gandryey_amdeng @skudchad_amdeng @lmoriche_amdeng
+

From fa53966bc58fce55b79356d2a10f1c46bfbb1999 Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Fri, 28 Mar 2025 09:22:20 -0700
Subject: [PATCH 162/177] Added KWS check for amd-mainline

---
 .github/workflows/kws-caller.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/kws-caller.yml b/.github/workflows/kws-caller.yml
index ffcbff7b1e..c0f4f26807 100644
--- a/.github/workflows/kws-caller.yml
+++ b/.github/workflows/kws-caller.yml
@@ -1,7 +1,7 @@
 name: Rocm Validation Suite KWS
 on:
   push:
-    branches: [amd-staging]
+    branches: [amd-staging, amd-mainline]
   pull_request:
     types: [opened, synchronize, reopened]
   workflow_dispatch:

From 0b4ce00f8378f74790bf565138df7ab659942ed3 Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Tue, 1 Apr 2025 09:03:43 -0700
Subject: [PATCH 163/177] !verify functionality

---
 .github/workflows/rocm-ci-caller.yml | 49 ++++++++++++++--------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/rocm-ci-caller.yml b/.github/workflows/rocm-ci-caller.yml
index c18163316b..8387648b05 100644
--- a/.github/workflows/rocm-ci-caller.yml
+++ b/.github/workflows/rocm-ci-caller.yml
@@ -1,24 +1,25 @@
-name: ROCm CI Caller
-on:
-  pull_request:
-    branches: [amd-staging, amd-npi, release/rocm-rel-*, amd-mainline]
-    types: [opened, reopened, synchronize]
-  push:
-    branches: [amd-mainline]
-  workflow_dispatch:
-  issue_comment:
-    types: [created]
-
-jobs:
-  call-workflow:
-    if: ${{ github.event_name != 'issue_comment' || github.event.comment.body == '!verify' }}
-    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
-    secrets: inherit
-    with:
-      input_sha: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-      input_pr_num: ${{ github.event_name == 'pull_request' && github.event.pull_request.number || 0 }}
-      input_pr_url: ${{ github.event_name == 'pull_request' && github.event.pull_request.html_url || '' }}
-      input_pr_title: ${{ github.event_name == 'pull_request' && github.event.pull_request.title || '' }}
-      repository_name: ${{ github.repository }}
-      base_ref: ${{ github.event_name == 'pull_request' && github.base_ref || github.ref }}
-      trigger_event_type: ${{ github.event_name }}
+name: ROCm CI Caller  
+on:  
+  pull_request:  
+    branches: [amd-staging, amd-npi, release/rocm-rel-*, amd-mainline]  
+    types: [opened, reopened, synchronize]  
+  push:  
+    branches: [amd-mainline]  
+  workflow_dispatch:  
+  issue_comment:  
+    types: [created]  
+  
+jobs:  
+  call-workflow:  
+    if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!linux-hip-psdb') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest')))  
+    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@verifyfunc
+    secrets: inherit  
+    with:  
+      input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}}  
+      input_pr_num: ${{github.event_name == 'pull_request' && github.event.pull_request.number || (github.event_name == 'issue_comment' && github.event.issue.number) || 0}}  
+      input_pr_url: ${{github.event_name == 'pull_request' && github.event.pull_request.html_url || (github.event_name == 'issue_comment' && github.event.issue.pull_request.html_url) || ''}}  
+      input_pr_title: ${{github.event_name == 'pull_request' && github.event.pull_request.title || (github.event_name == 'issue_comment' && github.event.issue.pull_request.title) || ''}}  
+      repository_name: ${{ github.repository }}  
+      base_ref: ${{github.event_name == 'pull_request' && github.event.pull_request.base.ref || (github.event_name == 'issue_comment' && github.event.issue.pull_request.base.ref) || github.ref}}  
+      trigger_event_type: ${{ github.event_name }}  
+      comment_text: ${{ github.event_name == 'issue_comment' && github.event.comment.body || '' }}

From f9ee9b6eaaf830505f77757bc8ade98afd8b4944 Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Tue, 1 Apr 2025 09:04:06 -0700
Subject: [PATCH 164/177] Removing NPI

---
 .github/workflows/rocm-ci-caller.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm-ci-caller.yml b/.github/workflows/rocm-ci-caller.yml
index 8387648b05..012b0ddac8 100644
--- a/.github/workflows/rocm-ci-caller.yml
+++ b/.github/workflows/rocm-ci-caller.yml
@@ -1,7 +1,7 @@
 name: ROCm CI Caller  
 on:  
   pull_request:  
-    branches: [amd-staging, amd-npi, release/rocm-rel-*, amd-mainline]  
+    branches: [amd-staging, release/rocm-rel-*, amd-mainline]  
     types: [opened, reopened, synchronize]  
   push:  
     branches: [amd-mainline]  

From 0dbfdc41821c7b7ac6100508edfd545492497440 Mon Sep 17 00:00:00 2001
From: "Mallya, Ameya Keshava" <AmeyaKeshava.Mallya@amd.com>
Date: Tue, 1 Apr 2025 09:50:48 -0700
Subject: [PATCH 165/177] fixed syntax to mainline

---
 .github/workflows/rocm-ci-caller.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/rocm-ci-caller.yml b/.github/workflows/rocm-ci-caller.yml
index 012b0ddac8..182079ec3e 100644
--- a/.github/workflows/rocm-ci-caller.yml
+++ b/.github/workflows/rocm-ci-caller.yml
@@ -12,7 +12,7 @@ on:
 jobs:  
   call-workflow:  
     if: github.event_name != 'issue_comment' ||(github.event_name == 'issue_comment' && github.event.issue.pull_request && (startsWith(github.event.comment.body, '!verify') || startsWith(github.event.comment.body, '!linux-hip-psdb') || startsWith(github.event.comment.body, '!verify release') || startsWith(github.event.comment.body, '!verify retest')))  
-    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@verifyfunc
+    uses: AMD-ROCm-Internal/rocm_ci_infra/.github/workflows/rocm_ci.yml@mainline
     secrets: inherit  
     with:  
       input_sha: ${{github.event_name == 'pull_request' && github.event.pull_request.head.sha || (github.event_name == 'push' && github.sha) || (github.event_name == 'issue_comment' && github.event.issue.pull_request.head.sha) || github.sha}}  

From 27b12ec7ccab0cf035ee87f1e4fa2a5095745d37 Mon Sep 17 00:00:00 2001
From: "GunaShekar, Ajay" <AJAY.GunaShekar@amd.com>
Date: Wed, 9 Apr 2025 15:39:39 -0700
Subject: [PATCH 166/177] SWDEV-523281 - hipLaunchKernelExC nvidia impl (#5)

---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index bf53844646..05542a01b7 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -1709,6 +1709,15 @@ typedef cudaUserObject_t hipUserObject_t;
 #if CUDA_VERSION >= CUDA_12030
 typedef cudaGraphEdgeData hipGraphEdgeData;
 #endif
+typedef cudaLaunchConfig_t hipLaunchConfig_t;
+typedef cudaLaunchAttribute hipLaunchAttribute;
+typedef CUlaunchAttribute hipDrvLaunchAttribute;
+typedef cudaKernel_t hipKernel_t;
+typedef CUlaunchConfig HIP_LAUNCH_CONFIG;
+typedef CUlaunchAttributeID hipDrvLaunchAttributeID;
+typedef CUlaunchAttributeValue hipDrvLaunchAttributeValue;
+#define hipLaunchAttributeCooperative cudaLaunchAttributeCooperative
+#define hipDrvLaunchAttributeCooperative CU_LAUNCH_ATTRIBUTE_COOPERATIVE
 
 typedef enum cudaGraphNodeType hipGraphNodeType;
 #define hipGraphNodeTypeKernel cudaGraphNodeTypeKernel
@@ -3405,6 +3414,11 @@ inline static hipError_t hipLaunchKernel(const void* function_address, dim3 numB
         cudaLaunchKernel(function_address, numBlocks, dimBlocks, args, sharedMemBytes, stream));
 }
 
+inline static hipError_t hipLaunchKernelExC(const hipLaunchConfig_t* config, const void* func, void** args) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchKernelExC(config, func, args));
+}
+
 inline static hipError_t hipModuleLaunchKernel(hipFunction_t f, unsigned int gridDimX,
                                                unsigned int gridDimY, unsigned int gridDimZ,
                                                unsigned int blockDimX, unsigned int blockDimY,
@@ -3802,6 +3816,17 @@ inline static hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 bloc
             cudaLaunchCooperativeKernel(reinterpret_cast<const void*>(f), gridDim, blockDim, kernelParams, sharedMemBytes, stream));
 }
 
+inline static hipError_t hipDrvLaunchKernelEx(const HIP_LAUNCH_CONFIG* config, hipFunction_t f, void** params, void** extra) {
+    return hipCUResultTohipError(
+        cuLaunchKernelEx(config, f, params, extra));
+}
+
+template <typename... KernelArgs, typename... Params>
+inline static hipError_t hipLaunchKernelEx(const hipLaunchConfig_t* config, void (*kernel)(KernelArgs...), Params&&... args) {
+    return hipCUDAErrorTohipError(
+        cudaLaunchKernelEx(config, kernel, std::forward<Params>(args)...));
+}
+
 inline static hipError_t hipTexObjectCreate(hipTextureObject_t* pTexObject,
                                             const HIP_RESOURCE_DESC* pResDesc,
                                             const HIP_TEXTURE_DESC* pTexDesc,

From 3eb783ecd30b49428d6950c42e6424fe2c1d2492 Mon Sep 17 00:00:00 2001
From: "Hila, Nino" <Nino.Hila@amd.com>
Date: Wed, 23 Apr 2025 16:16:52 -0400
Subject: [PATCH 167/177] Add palamida.yml (#7)

---
 .github/palamida.yml | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 .github/palamida.yml

diff --git a/.github/palamida.yml b/.github/palamida.yml
new file mode 100644
index 0000000000..c34f60c67f
--- /dev/null
+++ b/.github/palamida.yml
@@ -0,0 +1,6 @@
+disabled: false
+scmId: gh-emu-rocm
+branchesToScan:
+  - amd-staging
+  - amd-mainline
+jenkinsUrl: https://palamidajenkinsvm.amd.com/job/palamida/job/pci/job/ScanInitiatorGitHub
\ No newline at end of file

From bb6609087515aa8ba31743e3bbc319486f22902a Mon Sep 17 00:00:00 2001
From: "Manocha, Rahul" <rahul.manocha@amd.com>
Date: Thu, 8 May 2025 09:34:41 -0700
Subject: [PATCH 168/177] SWDEV-489106 - Cuda mappings for Linker APIs (#8)

Co-authored-by: rmanocha@amd.com <Dragoslav.Sicarov@amd.com>
---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index 05542a01b7..e65d9e5ee3 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -429,6 +429,44 @@ typedef enum cudaResourceViewFormat hipResourceViewFormat;
 #define HIPRTC_JIT_FAST_COMPILE CU_JIT_FAST_COMPILE
 #define HIPRTC_JIT_NUM_OPTIONS CU_JIT_NUM_OPTIONS
 
+#define hipJitOptionMaxRegisters CU_JIT_MAX_REGISTERS
+#define hipJitOptionThreadsPerBlock CU_JIT_THREADS_PER_BLOCK
+#define hipJitOptionWallTime CU_JIT_WALL_TIME
+#define hipJitOptionInfoLogBuffer CU_JIT_INFO_LOG_BUFFER
+#define hipJitOptionInfoLogBufferSizeBytes CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionErrorLogBuffer CU_JIT_ERROR_LOG_BUFFER
+#define hipJitOptionErrorLogBufferSizeBytes CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES
+#define hipJitOptionOptimizationLevel CU_JIT_OPTIMIZATION_LEVEL
+#define hipJitOptionTargetFromContext CU_JIT_TARGET_FROM_CUCONTEXT
+#define hipJitOptionTarget CU_JIT_TARGET
+#define hipJitOptionFallbackStrategy CU_JIT_FALLBACK_STRATEGY
+#define hipJitOptionGenerateDebugInfo CU_JIT_GENERATE_DEBUG_INFO
+#define hipJitOptionLogVerbose CU_JIT_LOG_VERBOSE
+#define hipJitOptionGenerateLineInfo CU_JIT_GENERATE_LINE_INFO
+#define hipJitOptionCacheMode CU_JIT_CACHE_MODE
+#define hipJitOptionSm3xOpt CU_JIT_NEW_SM3X_OPT
+#define hipJitOptionFastCompile CU_JIT_FAST_COMPILE
+#define hipJitOptionGlobalSymbolNames CU_JIT_GLOBAL_SYMBOL_NAMES
+#define hipJitOptionGlobalSymbolAddresses CU_JIT_GLOBAL_SYMBOL_ADDRESSES
+#define hipJitOptionGlobalSymbolCount CU_JIT_GLOBAL_SYMBOL_COUNT
+#define hipJitOptionLto CU_JIT_LTO
+#define hipJitOptionFtz CU_JIT_FTZ
+#define hipJitOptionPrecDiv CU_JIT_PREC_DIV
+#define hipJitOptionPrecSqrt CU_JIT_PREC_SQRT
+#define hipJitOptionFma CU_JIT_FMA
+#define hipJitOptionPositionIndependentCode CU_JIT_POSITION_INDEPENDENT_CODE
+#define hipJitOptionMinCTAPerSM CU_JIT_MIN_CTA_PER_SM
+#define hipJitOptionMaxThreadsPerBlock CU_JIT_MAX_THREADS_PER_BLOCK
+#define hipJitOptionOverrideDirectiveValues CU_JIT_OVERRIDE_DIRECTIVE_VALUES
+#define hipJitOptionNumOptions CU_JIT_NUM_OPTIONS
+#define hipJitInputCubin CU_JIT_INPUT_CUBIN
+#define hipJitInputPtx CU_JIT_INPUT_PTX
+#define hipJitInputFatBinary CU_JIT_INPUT_FATBINARY
+#define hipJitInputObject CU_JIT_INPUT_OBJECT
+#define hipJitInputLibrary CU_JIT_INPUT_LIBRARY
+#define hipJitInputNvvm CU_JIT_INPUT_NVVM
+#define hipJitNumInputTypes CU_JIT_NUM_INPUT_TYPES
+
 typedef cudaEvent_t hipEvent_t;
 typedef cudaStream_t hipStream_t;
 typedef cudaIpcEventHandle_t hipIpcEventHandle_t;
@@ -439,6 +477,7 @@ typedef enum cudaFuncCache hipFuncCache_t;
 typedef CUcontext hipCtx_t;
 typedef enum cudaSharedMemConfig hipSharedMemConfig;
 typedef CUfunc_cache hipFuncCache;
+typedef CUjitInputType hipJitInputType;
 typedef CUjit_option hipJitOption;
 typedef CUdevice hipDevice_t;
 typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
@@ -449,6 +488,7 @@ typedef enum cudaDeviceP2PAttr hipDeviceP2PAttr;
 #define hipFuncAttributeMaxDynamicSharedMemorySize cudaFuncAttributeMaxDynamicSharedMemorySize
 #define hipFuncAttributePreferredSharedMemoryCarveout cudaFuncAttributePreferredSharedMemoryCarveout
 
+typedef CUlinkState hipLinkState_t;
 typedef CUmodule hipModule_t;
 typedef CUfunction hipFunction_t;
 typedef CUdeviceptr hipDeviceptr_t;
@@ -3360,6 +3400,29 @@ inline static hipError_t hipDeviceTotalMem(size_t* bytes, hipDevice_t device) {
     return hipCUResultTohipError(cuDeviceTotalMem(bytes, device));
 }
 
+inline static hipError_t hipLinkAddData(hipLinkState_t state, hipJitInputType type, void* data,
+                                        size_t size, const char* name, unsigned int numOptions,
+                                        hipJitOption* options, void** optionValues) {
+    return hipCUResultTohipError(
+    cuLinkAddData(state, type, data, size, name, numOptions, options, optionValues));
+}
+inline static hipError_t hipLinkAddFile(hipLinkState_t state, hipJitInputType type,
+                                        const char* path, unsigned int numOptions,
+                                        hipJitOption* options, void** optionValues) {
+    return hipCUResultTohipError(
+    cuLinkAddFile(state, type, path, numOptions, options, optionValues));
+}
+inline static hipError_t hipLinkComplete(hipLinkState_t state, void** hipBinOut, size_t* sizeOut) {
+    return hipCUResultTohipError(cuLinkComplete(state, hipBinOut, sizeOut));
+}
+inline static hipError_t hipLinkCreate(unsigned int numOptions, hipJitOption* options,
+                                       void** optionValues, hipLinkState_t* stateOut) {
+    return hipCUResultTohipError(cuLinkCreate(numOptions, options, optionValues, stateOut));
+}
+inline static hipError_t hipLinkDestroy(hipLinkState_t state) {
+    return hipCUResultTohipError(cuLinkDestroy(state));
+}
+
 inline static hipError_t hipModuleLoad(hipModule_t* module, const char* fname) {
     return hipCUResultTohipError(cuModuleLoad(module, fname));
 }

From e0ff75fac63e77bb74cb3f0c406a8490cad47056 Mon Sep 17 00:00:00 2001
From: "Hila, Nino" <Nino.Hila@amd.com>
Date: Tue, 13 May 2025 00:42:01 -0400
Subject: [PATCH 169/177] Update palamida.yml (#9)

* Add palamida.yml - removing url
---
 .github/palamida.yml | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/.github/palamida.yml b/.github/palamida.yml
index c34f60c67f..47bd57a5ab 100644
--- a/.github/palamida.yml
+++ b/.github/palamida.yml
@@ -2,5 +2,4 @@ disabled: false
 scmId: gh-emu-rocm
 branchesToScan:
   - amd-staging
-  - amd-mainline
-jenkinsUrl: https://palamidajenkinsvm.amd.com/job/palamida/job/pci/job/ScanInitiatorGitHub
\ No newline at end of file
+  - amd-mainline
\ No newline at end of file

From ed4a99ba5c111269b86c3ceb69d91c582354522a Mon Sep 17 00:00:00 2001
From: "Assiouras, Ioannis" <Ioannis.Assiouras@amd.com>
Date: Wed, 21 May 2025 16:55:17 +0100
Subject: [PATCH 170/177] SWDEV-508965 - [6.4 Preview] Remove
 HIP_MEMSET_NODE_PARAMS struct (#11)

---
 .../nvidia_detail/nvidia_hip_runtime_api.h    | 30 ++++++++++++++-----
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index e65d9e5ee3..d9774dddfb 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -927,7 +927,6 @@ typedef struct cudaTextureDesc hipTextureDesc;
 typedef struct cudaResourceViewDesc hipResourceViewDesc;
 typedef CUDA_TEXTURE_DESC HIP_TEXTURE_DESC;
 typedef CUDA_RESOURCE_VIEW_DESC HIP_RESOURCE_VIEW_DESC;
-typedef CUDA_MEMSET_NODE_PARAMS HIP_MEMSET_NODE_PARAMS;
 // adding code for hipmemSharedConfig
 #define hipSharedMemBankSizeDefault cudaSharedMemBankSizeDefault
 #define hipSharedMemBankSizeFourByte cudaSharedMemBankSizeFourByte
@@ -4697,11 +4696,24 @@ inline static hipError_t hipGraphNodeGetEnabled(hipGraphExec_t hGraphExec, hipGr
     return hipCUDAErrorTohipError(cudaGraphNodeGetEnabled(hGraphExec, hNode, isEnabled));
 }
 
+inline static void hipMemsetParamsToCUDAMemsetNodeParams(CUDA_MEMSET_NODE_PARAMS *cuMemsetParams,
+                                                         const hipMemsetParams *memsetParams)
+{
+    cuMemsetParams->dst = reinterpret_cast<CUdeviceptr>(memsetParams->dst);
+    cuMemsetParams->elementSize = memsetParams->elementSize;
+    cuMemsetParams->height = memsetParams->height;
+    cuMemsetParams->pitch = memsetParams->pitch;
+    cuMemsetParams->value = memsetParams->value;
+    cuMemsetParams->width = memsetParams->width;
+}
+
 inline static hipError_t hipDrvGraphAddMemsetNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
-                                 const hipGraphNode_t* dependencies, size_t numDependencies,
-                                 const HIP_MEMSET_NODE_PARAMS* memsetParams, hipCtx_t ctx) {
+                                const hipGraphNode_t* dependencies, size_t numDependencies,
+                                const hipMemsetParams* memsetParams, hipCtx_t ctx) {
+    CUDA_MEMSET_NODE_PARAMS cuMemsetParams;
+    hipMemsetParamsToCUDAMemsetNodeParams(&cuMemsetParams, memsetParams);
     return hipCUResultTohipError(cuGraphAddMemsetNode(phGraphNode, hGraph, dependencies, numDependencies,
-                                    memsetParams, ctx));
+                                                      &cuMemsetParams, ctx));
 }
 
 inline static hipError_t hipDrvGraphAddMemcpyNode(hipGraphNode_t* phGraphNode, hipGraph_t hGraph,
@@ -4760,11 +4772,13 @@ inline static hipError_t hipDrvGraphExecMemcpyNodeSetParams(hipGraphExec_t hGrap
 }
 
 inline static hipError_t hipDrvGraphExecMemsetNodeSetParams(
-    hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const HIP_MEMSET_NODE_PARAMS* memsetParams,
+    hipGraphExec_t hGraphExec, hipGraphNode_t hNode, const hipMemsetParams* memsetParams,
     hipCtx_t ctx) {
-  return hipCUResultTohipError(
-      cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, memsetParams, ctx));
-}
+    CUDA_MEMSET_NODE_PARAMS cuMemsetParams;
+    hipMemsetParamsToCUDAMemsetNodeParams(&cuMemsetParams, memsetParams);
+    return hipCUResultTohipError(
+        cuGraphExecMemsetNodeSetParams(hGraphExec, hNode, &cuMemsetParams, ctx));
+  }
 #endif
 
 #if CUDA_VERSION >= CUDA_11040

From 0e51a3b5c19ee03202d8a671c96eb0edba1d40c2 Mon Sep 17 00:00:00 2001
From: "Gupta, Maneesh" <Maneesh.Gupta@amd.com>
Date: Thu, 22 May 2025 15:09:21 +0530
Subject: [PATCH 171/177] Create pull_request_template.md (#12)

Initial draft of PR template
---
 .github/pull_request_template.md | 36 ++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 .github/pull_request_template.md

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000000..3585d2a02f
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,36 @@
+## Associated JIRA ticket number/GitHub issue number
+<!-- For example: "Closes #1234" or "Fixes SWDEV-123456" -->
+
+## What type of PR is this? (check all applicable)
+
+- [ ] Refactor
+- [ ] Feature
+- [ ] Bug Fix
+- [ ] Optimization
+- [ ] Documentation Update
+- [ ] Continuous Integration
+
+## What were the changes?
+
+<!-- Please give a short summary of the change. -->
+
+## Why are these changes needed?
+
+<!-- Please explain the motivation behind the change and why this solves the given problem. -->
+
+## Updated CHANGELOG?
+
+<!-- Needed for Release updates for a ROCm release. -->
+
+- [ ] Yes
+- [ ] No, does not apply to this PR.
+
+## Added/Updated documentation?
+
+- [ ] Yes
+- [ ] No, does not apply to this PR.
+
+## Additional Checks
+
+- [ ] I have added tests relevant to the introduced functionality, and the unit tests are passing locally.
+- [ ] Any dependent changes have been merged.

From 67eb0874e97aafe23907561f8674d39db9cdb94e Mon Sep 17 00:00:00 2001
From: "Lytovchenko, Danylo" <Danylo.Lytovchenko@amd.com>
Date: Wed, 28 May 2025 10:10:24 +0200
Subject: [PATCH 172/177] SWDEV-123456 - add validation workflows (#13)

---
 .github/scripts/validate_pr_description.py    | 76 +++++++++++++++++++
 .github/workflows/keyword-check.yml           | 73 ++++++++++++++++++
 .github/workflows/pr-title-validate.yml       | 42 ++++++++++
 .github/workflows/validate-pr-description.yml | 22 ++++++
 4 files changed, 213 insertions(+)
 create mode 100644 .github/scripts/validate_pr_description.py
 create mode 100644 .github/workflows/keyword-check.yml
 create mode 100644 .github/workflows/pr-title-validate.yml
 create mode 100644 .github/workflows/validate-pr-description.yml

diff --git a/.github/scripts/validate_pr_description.py b/.github/scripts/validate_pr_description.py
new file mode 100644
index 0000000000..eb282acffd
--- /dev/null
+++ b/.github/scripts/validate_pr_description.py
@@ -0,0 +1,76 @@
+import os, re, sys
+from typing import List, Optional
+
+
+def is_checkbox(line: str) -> bool:
+    return bool(re.match(r"^\s*-\s*\[[ xX]\]\s*.+", line))
+
+
+def is_checked(line: str) -> bool:
+    return bool(re.match(r"^\s*-\s*\[[xX]\]\s*.+", line))
+
+
+def is_comment(line: str) -> bool:
+    return bool(re.match(r"^\s*<!--.*-->\s*$", line))
+
+
+def text_clean(lines: List[str]) -> str:
+    text = [line for line in lines if not is_comment(line)]
+    return "".join("".join(text).strip().split())
+
+
+def validate_section(section_name: str, lines: List[str]) -> Optional[str]:
+    has_checkboxes = any(is_checkbox(line) for line in lines)
+    if has_checkboxes:
+        if not any(is_checked(line) for line in lines):
+            return f"Section {section_name} is a checklist without selections"
+        return None
+    if not text_clean(lines):
+        return f"Section {section_name} is empty text section"
+    return None
+
+
+def check_description(description: str) -> List[str]:
+    if not description:
+        # pull_request_template is not merged yet, so treat as valid for now
+        return []
+        # return ["PR description is empty"]
+
+    sections = []
+    current_section = None
+    current_lines = []
+    errors = []
+
+    for line in description.splitlines():
+        header_match = re.match(r"^\s*##\s*(.+?)\s*$", line)
+        if header_match:
+            if current_section:
+                sections.append((current_section, current_lines))
+            current_section = header_match.group(1)
+            current_lines = []
+        elif current_section:
+            current_lines.append(line)
+
+    if current_section:
+        sections.append((current_section, current_lines))
+
+    if not sections:
+        return ["No sections available, template is empty"]
+
+    for section_name, section_lines in sections:
+        error = validate_section(section_name, section_lines)
+        if error:
+            errors.append(error)
+
+    return errors
+
+
+if __name__ == "__main__":
+    pr_description = os.getenv("PR_DESCRIPTION", "")
+    # The process exit status reports the result: 0 when no errors were found, 1 otherwise.
+    errors = check_description(pr_description)
+    if not errors:
+        print("All good")
+        sys.exit(0)
+    print("\n".join(errors))
+    sys.exit(1)
diff --git a/.github/workflows/keyword-check.yml b/.github/workflows/keyword-check.yml
new file mode 100644
index 0000000000..7177d1a669
--- /dev/null
+++ b/.github/workflows/keyword-check.yml
@@ -0,0 +1,73 @@
+name: Keywords checker
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened, edited]
+    branches:
+      - amd-staging
+  workflow_dispatch:
+
+jobs:
+  check-keywords:
+    runs-on: ubuntu-latest
+    env:
+      KEYWORDS: ${{ vars.KEYWORDS }}
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Check keywords
+        env:  # untrusted PR fields are passed as data, never interpolated into the script text
+          BASE_BRANCH: ${{ github.event.pull_request.base.ref }}
+          HEAD_BRANCH: ${{ github.event.pull_request.head.ref }}
+          PR_TITLE: ${{ github.event.pull_request.title }}
+        run: |
+          set -e
+
+          if [ -z "$KEYWORDS" ]; then
+            echo "No keywords set. Skipping check"
+            exit 0
+          fi
+
+          IFS=',' read -ra KEYWORDS_ARRAY <<< "$KEYWORDS"
+          echo "Checking against list of keywords: ${KEYWORDS_ARRAY[*]}"
+
+          MATCHED=0
+          # "${KEYWORDS_ARRAY[@]}" yields one keyword per iteration; [*] would join them into one pattern.
+          for file in $(git diff --name-only "origin/$BASE_BRANCH..origin/$HEAD_BRANCH"); do
+            if [ -f "$file" ]; then
+              for keyword in "${KEYWORDS_ARRAY[@]}"; do
+                while IFS= read -r line; do
+                  echo "Matched in '$file': $line"
+                  MATCHED=1
+                done < <(grep -in -E "${keyword}" "$file" || true)  # no pipeline subshell, so MATCHED persists
+              done
+            fi
+          done
+
+          for commit in $(git log --format=%H "origin/$BASE_BRANCH..origin/$HEAD_BRANCH"); do
+            msg=$(git log -1 --format=%B "$commit")
+            for keyword in "${KEYWORDS_ARRAY[@]}"; do
+              if echo "$msg" | grep -i -q "$keyword"; then
+                echo "Match in commit $commit: $msg"
+                MATCHED=1
+              fi
+            done
+          done
+
+          for keyword in "${KEYWORDS_ARRAY[@]}"; do
+            if echo "$PR_TITLE" | grep -i -q "$keyword"; then
+              echo "Match in PR title"
+              MATCHED=1
+            fi
+          done
+          if [ "$MATCHED" -eq 1 ]; then
+            echo "Keywords found, please see diagnostics higher"
+            exit 1
+          else
+            echo "No keywords found"
+            exit 0
+          fi
diff --git a/.github/workflows/pr-title-validate.yml b/.github/workflows/pr-title-validate.yml
new file mode 100644
index 0000000000..65f5564d81
--- /dev/null
+++ b/.github/workflows/pr-title-validate.yml
@@ -0,0 +1,42 @@
+name: Validate PR Title
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize, reopened]
+
+jobs:
+  validate-pr-title:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check PR Title
+        id: check-pr-title
+        env:  # the title is untrusted input: pass it as data instead of interpolating it into the script
+          PR_TITLE: ${{ github.event.pull_request.title }}
+        run: |
+          if [[ ! "$PR_TITLE" =~ ^SWDEV-[0-9]+ ]]; then
+            echo "::error::PR title must start with a Jira ticket ID, SWDEV-<num>"
+            exit 1
+          else
+            echo "PR title is valid"
+          fi
+
+  validate-commit-messages:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+            fetch-depth: 0
+
+      - name: Check all commit messages
+        id: validate-commit-messags
+        run: |
+          COMMITS=$(git log --format="%H %s" origin/${{ github.event.pull_request.base.ref }}..origin/${{ github.event.pull_request.head.ref }})
+          echo "$COMMITS"
+          echo "$COMMITS" | while read -r hash message; do
+            echo -e "$hash $message\n "
+            if [[ ! "$message" =~ ^SWDEV-[0-9]+ ]]; then
+              echo "::error:: $hash commit should start with Jira ticket ID, SWDEV-<num>"
+              exit 1
+            fi
+          done
diff --git a/.github/workflows/validate-pr-description.yml b/.github/workflows/validate-pr-description.yml
new file mode 100644
index 0000000000..db69802d81
--- /dev/null
+++ b/.github/workflows/validate-pr-description.yml
@@ -0,0 +1,22 @@
+name: Validate PR description
+
+on:
+  pull_request:
+    types: [opened, edited, synchronize]
+
+jobs:
+  validate-pr-description:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13"
+
+      - name: Validate PR description
+        env:
+          PR_DESCRIPTION: ${{ github.event.pull_request.body }}
+        run: python .github/scripts/validate_pr_description.py

From c04516b483b6c3a8362a029984050940432e7f9e Mon Sep 17 00:00:00 2001
From: "Lytovchenko, Danylo" <Danylo.Lytovchenko@amd.com>
Date: Thu, 5 Jun 2025 14:55:57 +0200
Subject: [PATCH 173/177] SWDEV-123456 - add automatic clang-formatting (#15)

---
 .github/hooks/clang-format-check.sh | 39 +++++++++++++++++++++++++++++
 .github/hooks/pre-commit            |  2 ++
 .github/workflows/clang-format.yml  | 22 ++++++++++++++++
 3 files changed, 63 insertions(+)
 create mode 100644 .github/hooks/clang-format-check.sh
 create mode 100644 .github/hooks/pre-commit
 create mode 100644 .github/workflows/clang-format.yml

diff --git a/.github/hooks/clang-format-check.sh b/.github/hooks/clang-format-check.sh
new file mode 100644
index 0000000000..a9f4f25f79
--- /dev/null
+++ b/.github/hooks/clang-format-check.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+RANGE=""
+
+while [[ $# -gt 0 ]]; do
+  echo "$1"
+  echo "${2-}"  # under `set -u` a bare $2 aborts when only one argument remains
+  case "$1" in
+  --range)
+    RANGE="${2:?--range requires a value}"
+    shift 2
+    ;;
+  *)
+    echo "Unknown arg $1" >&2
+    exit 64
+    ;;
+  esac
+done
+
+regex='\.(c|cc|cpp|cxx|h|hh|hpp|hxx)$'
+# Keep only Added/Copied/Modified/Renamed paths in both modes: deleted files cannot be read by clang-format.
+if [[ -n $RANGE ]]; then
+  files=$(git diff --name-only --diff-filter=ACMR "$RANGE" | grep -E "$regex" || true)
+else
+  files=$(git diff --cached --name-only --diff-filter=ACMR | grep -E "$regex" || true)
+fi
+echo "Checking $files"
+[[ -z $files ]] && exit 0
+
+clang_bin="${CLANG_FORMAT:-clang-format}"
+if ! command -v "$clang_bin" >/dev/null 2>&1; then
+  if [[ -x "/c/Program Files/LLVM/bin/clang-format.exe" ]]; then
+    clang_bin="/c/Program Files/LLVM/bin/clang-format.exe"
+  fi
+fi
+
+"$clang_bin" -style=file --dry-run -fallback-style=none -n -Werror $files
diff --git a/.github/hooks/pre-commit b/.github/hooks/pre-commit
new file mode 100644
index 0000000000..f42d5a3174
--- /dev/null
+++ b/.github/hooks/pre-commit
@@ -0,0 +1,2 @@
+#!/usr/bin/env bash
+exec "$(git rev-parse --show-toplevel)/.github/hooks/clang-format-check.sh"
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
new file mode 100644
index 0000000000..b17babaa2f
--- /dev/null
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,22 @@
+name: Clang format check
+
+on:
+  push:
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install clang-format
+        run: |
+          sudo apt update && sudo apt install -y clang-format
+
+      - name: Run clang-format-check
+        id: clang-format
+        run: |
+          chmod +x .github/hooks/clang-format-check.sh
+          ./.github/hooks/clang-format-check.sh --range "${{ github.event.before }}..${{ github.sha }}"

From 2366ff9f333932a5322bfc6d819c17fca28de660 Mon Sep 17 00:00:00 2001
From: "Lytovchenko, Danylo" <Danylo.Lytovchenko@amd.com>
Date: Tue, 24 Jun 2025 11:10:27 +0200
Subject: [PATCH 174/177] SWDEV-123456 - fix runners type (#17)

---
 .github/workflows/clang-format.yml            | 2 +-
 .github/workflows/keyword-check.yml           | 2 +-
 .github/workflows/pr-title-validate.yml       | 2 +-
 .github/workflows/validate-pr-description.yml | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index b17babaa2f..7e37ebbe97 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -5,7 +5,7 @@ on:
 
 jobs:
   format:
-    runs-on: ubuntu-latest
+    runs-on: hip-clr-dev1
     steps:
       - uses: actions/checkout@v4
         with:
diff --git a/.github/workflows/keyword-check.yml b/.github/workflows/keyword-check.yml
index 7177d1a669..b177ae4bff 100644
--- a/.github/workflows/keyword-check.yml
+++ b/.github/workflows/keyword-check.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   check-keywords:
-    runs-on: ubuntu-latest
+    runs-on: hip-clr-dev1
     env:
       KEYWORDS: ${{ vars.KEYWORDS }}
 
diff --git a/.github/workflows/pr-title-validate.yml b/.github/workflows/pr-title-validate.yml
index 65f5564d81..c6ffbd28b2 100644
--- a/.github/workflows/pr-title-validate.yml
+++ b/.github/workflows/pr-title-validate.yml
@@ -21,7 +21,7 @@ jobs:
           fi
 
   validate-commit-messages:
-    runs-on: ubuntu-latest
+    runs-on: hip-clr-dev1
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
diff --git a/.github/workflows/validate-pr-description.yml b/.github/workflows/validate-pr-description.yml
index db69802d81..63c0fb87e7 100644
--- a/.github/workflows/validate-pr-description.yml
+++ b/.github/workflows/validate-pr-description.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   validate-pr-description:
-    runs-on: ubuntu-latest
+    runs-on: hip-clr-dev1
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4

From 1992d81122f6c9ce4dd2f18e9ef394af49678c1e Mon Sep 17 00:00:00 2001
From: "Lytovchenko, Danylo" <Danylo.Lytovchenko@amd.com>
Date: Thu, 26 Jun 2025 12:15:39 +0200
Subject: [PATCH 175/177] SWDEV-123456 - fix runners names (#18)

---
 .github/workflows/clang-format.yml            | 10 +++++-----
 .github/workflows/keyword-check.yml           |  2 +-
 .github/workflows/pr-title-validate.yml       | 10 +++++++---
 .github/workflows/validate-pr-description.yml |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
index 7e37ebbe97..0298b5fc76 100644
--- a/.github/workflows/clang-format.yml
+++ b/.github/workflows/clang-format.yml
@@ -1,11 +1,11 @@
-name: Clang format check
-
+name: Clang format check  
 on:
-  push:
+  pull_request:
+    types: [synchronize, opened]
 
 jobs:
   format:
-    runs-on: hip-clr-dev1
+    runs-on: AMD-ROCm-Internal-dev1
     steps:
       - uses: actions/checkout@v4
         with:
@@ -19,4 +19,4 @@ jobs:
         id: clang-format
         run: |
           chmod +x .github/hooks/clang-format-check.sh
-          ./.github/hooks/clang-format-check.sh --range "${{ github.event.before }}..${{ github.sha }}"
+          ./.github/hooks/clang-format-check.sh --range "${{ github.event.pull_request.base.sha }}..${{ github.event.pull_request.head.sha }}"
diff --git a/.github/workflows/keyword-check.yml b/.github/workflows/keyword-check.yml
index b177ae4bff..12108cee14 100644
--- a/.github/workflows/keyword-check.yml
+++ b/.github/workflows/keyword-check.yml
@@ -9,7 +9,7 @@ on:
 
 jobs:
   check-keywords:
-    runs-on: hip-clr-dev1
+    runs-on: AMD-ROCm-Internal-dev1
     env:
       KEYWORDS: ${{ vars.KEYWORDS }}
 
diff --git a/.github/workflows/pr-title-validate.yml b/.github/workflows/pr-title-validate.yml
index c6ffbd28b2..f68440d948 100644
--- a/.github/workflows/pr-title-validate.yml
+++ b/.github/workflows/pr-title-validate.yml
@@ -21,7 +21,7 @@ jobs:
           fi
 
   validate-commit-messages:
-    runs-on: hip-clr-dev1
+    runs-on: AMD-ROCm-Internal-dev1
     steps:
       - name: Checkout code
         uses: actions/checkout@v3
@@ -35,8 +35,12 @@ jobs:
           echo "$COMMITS"
           echo "$COMMITS" | while read -r hash message; do
             echo -e "$hash $message\n "
-            if [[ ! "$message" =~ ^SWDEV-[0-9]+ ]]; then
-              echo "::error:: $hash commit should start with Jira ticket ID, SWDEV-<num>"
+            if [[ "$message" =~ ^SWDEV-[0-9]+ ]]; then
+              echo "Valid JIRA ticket format"
+            elif [[ "$message" =~ ^Merge\ branch ]]; then
+              echo "Merge commits are allowed"
+            else
+              echo "::error:: $hash commit should start with Jira ticket ID, SWDEV-<num> or be a merge commit"
               exit 1
             fi
           done
diff --git a/.github/workflows/validate-pr-description.yml b/.github/workflows/validate-pr-description.yml
index 63c0fb87e7..d9b12b4ba6 100644
--- a/.github/workflows/validate-pr-description.yml
+++ b/.github/workflows/validate-pr-description.yml
@@ -6,7 +6,7 @@ on:
 
 jobs:
   validate-pr-description:
-    runs-on: hip-clr-dev1
+    runs-on: AMD-ROCm-Internal-dev1
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4

From d9ec8a09771569a320940b4d4e5964b19fc99d05 Mon Sep 17 00:00:00 2001
From: "Lytovchenko, Danylo" <Danylo.Lytovchenko@amd.com>
Date: Fri, 27 Jun 2025 17:47:11 +0200
Subject: [PATCH 176/177] SWDEV-123456 - add correct line-by-line clang format
 script (#19)

---
 .github/hooks/clang-format-check.sh | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/.github/hooks/clang-format-check.sh b/.github/hooks/clang-format-check.sh
index a9f4f25f79..e417133217 100644
--- a/.github/hooks/clang-format-check.sh
+++ b/.github/hooks/clang-format-check.sh
@@ -36,4 +36,21 @@ if ! command -v "$clang_bin" >/dev/null 2>&1; then
   fi
 fi
 
-"$clang_bin" -style=file --dry-run -fallback-style=none -n -Werror $files
+clang_format_diff="${CLANG_FORMAT_DIFF:-clang-format-diff}"
+if ! command -v "$clang_format_diff" >/dev/null 2>&1; then
+  if [[ -x "/c/Program Files/LLVM/share/clang/clang-format-diff.py" ]]; then
+    clang_format_diff="/c/Program Files/LLVM/share/clang/clang-format-diff.py"
+  fi
+fi
+
+for file in $files; do
+  echo "Checking lines of $file"
+
+  if [[ -n $RANGE ]]; then
+    diff_output=$(git diff -U0 "$RANGE" -- "$file")
+  else
+    diff_output=$(git diff -U0 --cached -- "$file")
+  fi
+
+  echo "$diff_output" | "$clang_format_diff" -style=file -fallback-style=none -p1
+done

From 2d5e4bd3fed07aea4f8503a13dc2ca8d2f0e1917 Mon Sep 17 00:00:00 2001
From: Robin Voetter <robin@streamhpc.com>
Date: Tue, 8 Jul 2025 11:10:45 +0200
Subject: [PATCH 177/177] fix typo in hipBatchMemOpParamsTocudaBatchMemOpParams

---
 hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
index d9774dddfb..7a3c37076b 100644
--- a/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
+++ b/hipnv/include/hip/nvidia_detail/nvidia_hip_runtime_api.h
@@ -665,7 +665,7 @@ static inline void hipBatchMemOpParamsTocudaBatchMemOpParams(CUstreamBatchMemOpP
             a[i].writeValue.alias = (CUdeviceptr)(p[i].writeValue.alias);
         }
         else if (p[i].memoryBarrier.operation == hipStreamMemOpBarrier) {
-            a[i].memoryBarrier.operation == CU_STREAM_MEM_OP_BARRIER;
+            a[i].memoryBarrier.operation = CU_STREAM_MEM_OP_BARRIER;
             a[i].memoryBarrier.flags = p[i].memoryBarrier.flags;
         }
         else if (p[i].flushRemoteWrites.operation == hipStreamMemOpFlushRemoteWrites) {