From 5fb09879c7b2f8fe1b0353947769fb69e14cd6cf Mon Sep 17 00:00:00 2001
From: Rahul Garg <rahul.garg@amd.com>
Date: Mon, 9 Jan 2017 23:54:01 +0530
Subject: [PATCH 01/18] Added state for hipDevice.

Change-Id: Idbc3c04cd054a01b634856a1e0a23ff172e991aa
---
 include/hip/hip_runtime_api.h |  1 +
 src/hip_device.cpp            | 64 ++++++++++++++++++++++++-----------
 src/hip_hcc.cpp               |  5 ++-
 src/hip_hcc.h                 |  6 +++-
 src/hip_memory.cpp            |  8 ++---
 5 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index a45a1ee27e..a2bfed5c69 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -181,6 +181,7 @@ typedef enum hipError_t {
     hipErrorSharedObjectSymbolNotFound = 302,
     hipErrorSharedObjectInitFailed  = 303,
     hipErrorOperatingSystem         = 304,
+    hipErrorSetOnActiveProcess      = 305,
     hipErrorInvalidHandle           = 400,
     hipErrorNotFound                = 500,
     hipErrorIllegalAddress          = 700,
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index 1cfdaa619d..0f2c2e2753 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -175,6 +175,24 @@ hipError_t hipDeviceReset(void)
     return ihipLogStatus(hipSuccess);
 }
 
+hipError_t ihipDeviceSetState(void)
+{
+    hipError_t e = hipErrorInvalidContext;
+    auto *ctx = ihipGetTlsDefaultCtx();
+
+    if (ctx) {
+        ihipDevice_t *deviceHandle = ctx->getWriteableDevice();
+        if(deviceHandle->_state == 0)
+        {
+            deviceHandle->_state = 1;
+        }
+        e = hipSuccess;
+    }
+
+    return ihipLogStatus(e);
+}
+
+
 hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device)
 {
     hipError_t e = hipSuccess;
@@ -289,29 +307,35 @@ hipError_t hipSetDeviceFlags( unsigned int flags)
     // TODO : does this really OR in the flags or replaces previous flags:
     // TODO : Review error handling behavior for this function, it often returns ErrorSetOnActiveProcess
     if (ctx) {
-       ctx->_ctxFlags = ctx->_ctxFlags | flags;
-       if (flags & hipDeviceScheduleMask) {
-           switch (hipDeviceScheduleMask) {
-              case hipDeviceScheduleAuto:
-              case hipDeviceScheduleSpin:
-              case hipDeviceScheduleYield:
-              case hipDeviceScheduleBlockingSync:
-                   e = hipSuccess;
-                   break;
-               default:
-                   e = hipSuccess; // TODO - should this be error?  Map to Auto?
-                   //e = hipErrorInvalidValue;
-                   break;
+       auto *deviceHandle = ctx->getDevice();
+       if(deviceHandle->_state == 0)
+       {
+           ctx->_ctxFlags = ctx->_ctxFlags | flags;
+           if (flags & hipDeviceScheduleMask) {
+               switch (hipDeviceScheduleMask) {
+                  case hipDeviceScheduleAuto:
+                  case hipDeviceScheduleSpin:
+                  case hipDeviceScheduleYield:
+                  case hipDeviceScheduleBlockingSync:
+                       e = hipSuccess;
+                       break;
+                  default:
+                       e = hipSuccess; // TODO - should this be error?  Map to Auto?
+                       //e = hipErrorInvalidValue;
+                       break;
+               }
            }
-       }
 
-       unsigned supportedFlags = hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
+           unsigned supportedFlags = hipDeviceScheduleMask | hipDeviceMapHost | hipDeviceLmemResizeToMax;
 
-       if (flags & (~supportedFlags)) {
-          e = hipErrorInvalidValue;
-       }
-    } else {
-       e = hipErrorInvalidDevice;
+           if (flags & (~supportedFlags)) {
+              e = hipErrorInvalidValue;
+           }
+        } else {
+              e = hipErrorSetOnActiveProcess;
+        }
+        } else {
+           e = hipErrorInvalidDevice;
     }
 
     return ihipLogStatus(e);
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index a4ef2b392b..d760ade15d 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -482,7 +482,8 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
 //=================================================================================================
 ihipDevice_t::ihipDevice_t(unsigned deviceId, unsigned deviceCnt, hc::accelerator &acc) :
     _deviceId(deviceId),
-    _acc(acc)
+    _acc(acc),
+    _state(0)
 {
     hsa_agent_t *agent = static_cast<hsa_agent_t*> (acc.get_hsa_agent());
     if (agent) {
@@ -865,6 +866,7 @@ void ihipCtx_t::locked_reset()
     // Reset will remove peer mapping so don't need to do this explicitly.
     // FIXME - This is clearly a non-const action!  Is this a context reset or a device reset - maybe should reference count?
     ihipDevice_t *device = getWriteableDevice();
+    device->_state = 0;
     am_memtracker_reset(device->_acc);
 
 };
@@ -1553,6 +1555,7 @@ const char *ihipErrorString(hipError_t hip_error)
         case hipErrorSharedObjectSymbolNotFound : return "hipErrorSharedObjectSymbolNotFound";
         case hipErrorSharedObjectInitFailed     : return "hipErrorSharedObjectInitFailed";
         case hipErrorOperatingSystem            : return "hipErrorOperatingSystem";
+        case hipErrorSetOnActiveProcess         : return "hipErrorSetOnActiveProcess";
         case hipErrorInvalidHandle              : return "hipErrorInvalidHandle";
         case hipErrorNotFound                   : return "hipErrorNotFound";
         case hipErrorIllegalAddress             : return "hipErrorIllegalAddress";
diff --git a/src/hip_hcc.h b/src/hip_hcc.h
index ed85f1494c..8a4d457cb1 100644
--- a/src/hip_hcc.h
+++ b/src/hip_hcc.h
@@ -204,7 +204,8 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
 #define HIP_INIT()\
 	std::call_once(hip_initialized, ihipInit);\
     ihipCtxStackUpdate();
-
+#define HIP_SET_DEVICE()\
+    ihipDeviceSetState();
 
 // This macro should be called at the beginning of every HIP API.
 // It initialies the hip runtime (exactly once), and
@@ -566,6 +567,8 @@ public:
 
     ihipCtx_t               *_primaryCtx;
 
+    int                      _state; //1 if device is set otherwise 0
+
 private:
     hipError_t initProperties(hipDeviceProp_t* prop);
 };
@@ -703,6 +706,7 @@ extern ihipCtx_t    *ihipGetTlsDefaultCtx();
 extern void          ihipSetTlsDefaultCtx(ihipCtx_t *ctx);
 extern hipError_t    ihipSynchronize(void);
 extern void          ihipCtxStackUpdate();
+extern hipError_t    ihipDeviceSetState();
 
 extern ihipDevice_t *ihipGetDevice(int);
 ihipCtx_t * ihipGetPrimaryCtx(unsigned deviceIndex);
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 7e1a1738a6..74578e9b4b 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -105,7 +105,7 @@ hipError_t hipHostGetDevicePointer(void **devicePointer, void *hostPointer, unsi
 hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 {
     HIP_INIT_API(ptr, sizeBytes);
-
+    HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
     // return NULL pointer when malloc size is 0
     if (sizeBytes == 0)
@@ -161,7 +161,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
 hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
 {
     HIP_INIT_API(ptr, sizeBytes, flags);
-
+    HIP_SET_DEVICE();
     hipError_t hip_status = hipSuccess;
 
     auto ctx = ihipGetTlsDefaultCtx();
@@ -233,7 +233,7 @@ hipError_t hipHostAlloc(void** ptr, size_t sizeBytes, unsigned int flags)
 hipError_t hipMallocPitch(void** ptr, size_t* pitch, size_t width, size_t height)
 {
     HIP_INIT_API(ptr, pitch, width, height);
-
+    HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
 
     if(width == 0 || height == 0)
@@ -285,7 +285,7 @@ hipError_t hipMallocArray(hipArray** array, const hipChannelFormatDesc* desc,
         size_t width, size_t height, unsigned int flags)
 {
     HIP_INIT_API(array, desc, width, height, flags);
-
+    HIP_SET_DEVICE();
     hipError_t  hip_status = hipSuccess;
 
     auto ctx = ihipGetTlsDefaultCtx();

From 3a42a7642a2325b9514e89616f7d70459db41fbb Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Tue, 3 Jan 2017 22:17:16 -0600
Subject: [PATCH 02/18] tolerate spaces in hip args

---
 bin/hipcc          | 19 +++++-----
 src/hip_hcc.cpp    | 91 +++++++++++++++++++++++++++++++++++-----------
 src/hip_hcc.h      | 40 +++++++++++++++-----
 src/hip_stream.cpp | 10 +++--
 4 files changed, 116 insertions(+), 44 deletions(-)

diff --git a/bin/hipcc b/bin/hipcc
index 3e88b77693..ccea650776 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -244,6 +244,8 @@ my $toolArgs = "";  # arguments to pass to the hcc or nvcc tool
 
 foreach $arg (@ARGV)
 {
+    $trimarg = $arg;
+    $trimarg =~ s/^\s+|\s+$//g;  # Remove whitespace
     my $swallowArg = 0;
     if ($arg eq '-c') {
         $compileOnly = 1;
@@ -254,38 +256,37 @@ foreach $arg (@ARGV)
         $needLDFLAGS = 1;
     }
 
-    if(($arg eq '-stdlib=libc++') and ($setStdLib eq 0))
+    if(($trimarg eq '-stdlib=libc++') and ($setStdLib eq 0))
     {
         $HIPCXXFLAGS .= " -stdlib=libc++";
         $setStdLib = 1;
     }
-    if(($arg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
+    if(($trimarg eq '-stdlib=libstdc++') and ($setStdLib eq 0))
     {
         $HIPCXXFLAGS .= " -stdlib=libstdc++";
         $HIPCXXFLAGS .= $HCC_WA_FLAGS;
         $setStdLib = 1;
     }
-    if($arg eq '--version') {
+    if($trimarg eq '--version') {
         $printHipVersion = 1;
     }
-    if($arg eq '--short-version') {
+    if($trimarg eq '--short-version') {
         $printHipVersion = 1;
         $runCmd = 0;
     }
-    if($arg eq '-M') {
+    if($trimarg eq '-M') {
         $compileOnly = 1;
         $buildDeps = 1;
     }
-    if($arg eq '-use_fast_math') {
-        print "In fast Math";
+    if($trimarg eq '-use_fast_math') {
         $HIPCXXFLAGS .= " -DHIP_FAST_MATH ";
     }
-    if(($arg eq '-use-staticlib') and ($setLinkType eq 0))
+    if(($trimarg eq '-use-staticlib') and ($setLinkType eq 0))
     {
         $linkType = 0;
         $setLinkType = 1;
     }
-    if(($arg eq '-use-sharedlib') and ($setLinkType eq 0))
+    if(($trimarg eq '-use-sharedlib') and ($setLinkType eq 0))
     {
         $linkType = 1;
         $setLinkType = 1;
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index d760ade15d..c87e201c0c 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -69,6 +69,8 @@ std::string HIP_LAUNCH_BLOCKING_KERNELS;
 std::vector<std::string> g_hipLaunchBlockingKernels;
 int HIP_API_BLOCKING = 0;
 
+int HIP_MAX_QUEUES = 0;
+
 int HIP_PRINT_ENV = 0;
 int HIP_TRACE_API= 0;
 std::string HIP_TRACE_API_COLOR("green");
@@ -254,7 +256,7 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int
     };
 
 
-    tprintf(DB_SYNC, " streamCreate: stream=%p\n", this);
+    tprintf(DB_SYNC, " streamCreate: stream=%s\n", ToString(this).c_str());
 };
 
 
@@ -269,7 +271,7 @@ ihipStream_t::~ihipStream_t()
 void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty)
 {
     if (! assertQueueEmpty) {
-        tprintf (DB_SYNC, "stream %p wait for queue-empty..\n", this);
+        tprintf (DB_SYNC, "stream %s wait for queue-empty..\n", ToString(this).c_str());
         hc::hcWaitMode waitMode = hc::hcWaitModeActive;
 
         if (_scheduleMode == Auto) {
@@ -353,6 +355,7 @@ ihipCtx_t * ihipStream_t::getCtx() const
 // Lock the stream to prevent other threads from intervening.
 LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
 {
+
     LockedAccessor_StreamCrit_t crit(_criticalData, false/*no unlock at destruction*/);
 
     if(crit->_kernelCnt > HIP_NUM_KERNELS_INFLIGHT){
@@ -361,6 +364,17 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
     }
     crit->_kernelCnt++;
 
+    if (HIP_MAX_QUEUES && !crit->_hasQueue)  {
+        // Obtain mutex access to the device critical data, release by destructor
+        LockedAccessor_CtxCrit_t  ctxCrit(this->_ctx->criticalData());
+        crit->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
+        crit->_hasQueue = true;
+    }
+
+
+
+    assert(crit->_hasQueue);
+
     return crit;
 }
 
@@ -391,21 +405,22 @@ void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::acce
 };
 
 
-//=============================================================================
-// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
-// The packed _peerAgents can efficiently be used on each memory allocation.
-template<>
-void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
-{
-    _peerCnt = 0;
-    std::for_each (_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) {
-        _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
-    });
-}
+
+    //=============================================================================
+    // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
+    // The packed _peerAgents can efficiently be used on each memory allocation.
+    template<>
+    void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
+    {
+        _peerCnt = 0;
+        std::for_each (_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) {
+            _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
+        });
+    }
 
 
-template<>
-bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
+    template<>
+    bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
 {
     auto match = std::find(_peers.begin(), _peers.end(), peer);
     return (match != std::end(_peers));
@@ -880,6 +895,44 @@ std::string ihipCtx_t::toString() const
   return ss.str();
 };
 
+
+hc::accelerator_view 
+ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream )
+{
+
+    // TODO - review handling if queue can't be found.
+    while (1) {
+        for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
+            if (*iter != needyStream) {
+                auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
+                if (victimCritPtr && victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
+
+
+                    victimCritPtr->_hasQueue = false;
+
+                    tprintf(DB_SYNC, " stealActiveQueue move queue from victim:%s to needy:%s\n",
+                            ToString(*iter).c_str(), ToString(needyStream).c_str());
+
+                    return victimCritPtr->_av;
+                }
+            }
+        }
+    }
+}
+
+
+hc::accelerator_view
+ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
+{
+    if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) {
+        // Steal a queue from an existing stream:
+        return this->stealActiveQueue (ctxCrit, nullptr);
+    } else {
+        // Create a new view
+        return getWriteableDevice()->_acc.create_view();
+    }
+}
+
 //----
 
 
@@ -921,13 +974,6 @@ void ihipCtx_t::locked_syncDefaultStream(bool waitOnSelf)
     }
 }
 
-//---
-void ihipCtx_t::locked_addStream(ihipStream_t *s)
-{
-    LockedAccessor_CtxCrit_t  crit(_criticalData);
-
-    crit->addStream(s);
-}
 
 //---
 void ihipCtx_t::locked_removeStream(ihipStream_t *s)
@@ -1217,6 +1263,7 @@ void ihipInit()
     READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );
     
 
+    READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device.  Additional streams will share the specified number of queues.  0=no limit.");
 
     READ_ENV_C(release, HIP_DB, 0,  "Print debug info.  Bitmask (HIP_DB=0xff) or flags separated by '+' (HIP_DB=api+sync+mem+copy)", HIP_DB_callback);
     if ((HIP_DB & (1<<DB_API))  && (HIP_TRACE_API == 0)) {
diff --git a/src/hip_hcc.h b/src/hip_hcc.h
index 8a4d457cb1..f2a2fb49fa 100644
--- a/src/hip_hcc.h
+++ b/src/hip_hcc.h
@@ -235,8 +235,7 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
 #define DB_SYNC   1 /* 0x02 - trace synchronization pieces */
 #define DB_MEM    2 /* 0x04 - trace memory allocation / deallocation */
 #define DB_COPY   3 /* 0x08 - trace memory copy and peer commands. . */
-#define DB_SIGNAL 4 /* 0x10 - trace signal pool commands */
-#define DB_MAX_FLAG 5
+#define DB_MAX_FLAG 4
 // When adding a new debug flag, also add to the char name table below.
 //
 
@@ -251,7 +250,6 @@ static const DbName dbName [] =
     {KYEL, "sync"},
     {KCYN, "mem"},
     {KMAG, "copy"},
-    {KRED, "signal"},
 };
 
 
@@ -366,6 +364,7 @@ struct LockedBase {
     // Most uses should use the lock-accessor.
     void lock() { _mutex.lock(); }
     void unlock() { _mutex.unlock(); }
+    bool try_lock() { return _mutex.try_lock(); }
 
     MUTEX_TYPE  _mutex;
 };
@@ -402,7 +401,8 @@ class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
 public:
     ihipStreamCriticalBase_t(hc::accelerator_view av) :
         _kernelCnt(0),
-        _av(av)
+        _av(av),
+        _hasQueue(true)
     {
     };
 
@@ -410,11 +410,20 @@ public:
     }
 
     ihipStreamCriticalBase_t<StreamMutex>  * mlock() { LockedBase<MUTEX_TYPE>::lock(); return this;};
+    ihipStreamCriticalBase_t<StreamMutex>  * mtry_lock() { 
+        return LockedBase<MUTEX_TYPE>::try_lock() ?  this: nullptr; 
+    };
 
 public:
-    // TODO - remove _kernelCnt mechanism:
     uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().
+
     hc::accelerator_view        _av;
+
+    // True if the stream has an allocated queue (accelerato_view) for its use:
+    // Always true at ihipStream creation but queue may later be stolen.
+    // This acts as a valid bit for the _av.
+    bool                        _hasQueue;
+private:
 };
 
 
@@ -422,6 +431,7 @@ public:
 // for the ihipCtx_t and then for the individual streams.  The locks should not be acquired in reverse order
 // or deadlock may occur.  In some cases, it may be possible to reduce the range where the locks must be held.
 // HIP routines should avoid acquiring and releasing the same lock during the execution of a single HIP API.
+// Another option is to use try_lock in the innermost lock query.
 
 
 typedef ihipStreamCriticalBase_t<StreamMutex> ihipStreamCritical_t;
@@ -436,6 +446,7 @@ public:
     enum ScheduleMode {Auto, Spin, Yield};
     typedef uint64_t SeqNum_t ;
 
+    // TODOD -make av a reference to avoid shared_ptr overhead?
     ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int flags);
     ~ihipStream_t();
 
@@ -499,11 +510,14 @@ private:
 
     bool canSeeMemory(const ihipCtx_t *thisCtx, const hc::AmPointerInfo *dstInfo, const hc::AmPointerInfo *srcInfo);
 
-
-private: // Data
+public: // TODO - move private
     // Critical Data - MUST be accessed through LockedAccessor_StreamCrit_t
     ihipStreamCritical_t        _criticalData;
 
+private: // Data
+
+    std::mutex                 _hasQueueLock;
+
     ihipCtx_t  *_ctx;  // parent context that owns this stream.
 
     // Friends:
@@ -602,6 +616,7 @@ public:
     const std::list<ihipStream_t*> &const_streams() const { return _streams; };
 
 
+
     // Peer Accessor classes:
     bool isPeerWatcher(const ihipCtx_t *peer); // returns True if peer has access to memory physically located on this device.
     bool addPeerWatcher(const ihipCtx_t *thisCtx, ihipCtx_t *peer);
@@ -651,17 +666,22 @@ public: // Functions:
     ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
     ~ihipCtx_t();
 
-    // Functions which read or write the critical data are named locked_.
+    // Functions which read or write the critical data are named locked_. 
+    // (might be better called "locking_"
     // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
     // External functions which call several locked_ functions will acquire and release the lock for each function.  if this occurs in
     // performance-sensitive code we may want to refactor by adding non-locked functions and creating a new locked_ member function to call them all.
-    void locked_addStream(ihipStream_t *s);
     void locked_removeStream(ihipStream_t *s);
     void locked_reset();
     void locked_waitAllStreams();
     void locked_syncDefaultStream(bool waitOnSelf);
 
-    ihipCtxCritical_t  &criticalData() { return _criticalData; }; // TODO, move private.  Fix P2P.
+    // Will allocate a queue and assign it to the needyStream:
+    hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit,
+                        ihipStream_t *needyStream);
+    hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
+
+    ihipCtxCritical_t  &criticalData() { return _criticalData; }; 
 
     const ihipDevice_t *getDevice() const { return _device; };
     int                 getDeviceNum() const { return _device->_deviceId; };
diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp
index 8350035357..d754ffe5f6 100644
--- a/src/hip_stream.cpp
+++ b/src/hip_stream.cpp
@@ -45,11 +45,15 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
         //Note this is an execute_in_order queue, so all kernels submitted will atuomatically wait for prev to complete:
         //This matches CUDA stream behavior:
 
-        auto istream = new ihipStream_t(ctx, acc.create_view(), flags);
+        {
+            // Obtain mutex access to the device critical data, release by destructor
+            LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());
+            auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags);
 
-        ctx->locked_addStream(istream);
+            ctxCrit->addStream(istream);
+            *stream = istream;
+        }
 
-        *stream = istream;
         tprintf(DB_SYNC, "hipStreamCreate, stream=%p\n", *stream);
     } else {
         e = hipErrorInvalidDevice;

From fd209f37d97c71a2fedd63b7ffa54f1a1f77ab6f Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 26 Dec 2016 20:26:56 -0600
Subject: [PATCH 03/18] Add more notes on debugging HIP apps.

---
 docs/markdown/hip_profiling.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md
index 0c55acf85e..61f1bbcfbc 100644
--- a/docs/markdown/hip_profiling.md
+++ b/docs/markdown/hip_profiling.md
@@ -340,3 +340,25 @@ These options cause HCC to serialize.  Useful if you have libraries or code whic
 - HSA_ENABLE_SDMA=0     : Causes host-to-device and device-to-host copies to use compute shader blit kernels rather than the dedicated DMA copy engines.  Compute shader copies have low latency (typically < 5us) and can achieve approximately 80% of the bandwidth of the DMA copy engine.  This flag is useful to isolate issues with the hardware copy engines.
 - HSA_ENABLE_INTERRUPT=0 : Causes completion signals to be detected with memory-based polling rather than interrupts.  Can be useful to diagnose interrupt storm issues in the driver.
 - HSA_DISABLE_CACHE=1  : Disables the GPU L2 data cache.
+
+### Debugging HIP Applications
+
+- The variable "tls_tidInfo" contains the API sequence number (_apiSeqNum) - a monotonically increasing count of the HIP APIs called from this thread.  This can be useful for setting conditional breakpoints.  Also, each new HIP thread is mapped to a monotonically increasing shortTid ID.  Both of these fields are displayed in the HIP debug info.
+```
+(gdb) p tls_tidInfo
+$32 = {_shortTid = 1, _apiSeqNum = 803}
+```
+
+- HCC tracks all of the application memory allocations, including those from HIP and HC's "am_alloc".  These can be printed by calling the function 'hc::am_memtracker_print()'.
+An optional argument specifies a void * targetPointer - the print routine will mark the allocation which contains the specified pointer with "-->" in the printed output.
+This example shows a sample GDB session where we print the memory allocated by this process and mark a specified address by using the gdb "call" function.
+The gdb syntax also supports using the variable name (in this case 'dst'):
+```
+(gdb) p dst
+$33 = (void *) 0x5ec7e9000
+(gdb) call hc::am_memtracker_print(dst)
+TargetAddress:0x5ec7e9000
+   0x504cfc000-0x504cfc00f::  allocSeqNum:1 hostPointer:0x504cfc000 devicePointer:0x504cfc000 sizeBytes:16 isInDeviceMem:0 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+...
+-->0x5ec7e9000-0x5f7e28fff::  allocSeqNum:488 hostPointer:(nil) devicePointer:0x5ec7e9000 sizeBytes:191102976 isInDeviceMem:1 isAmManaged:1 appId:0 appAllocFlags:0 appPtr:(nil)
+```

From 93fbc9cf7b32adc478d1965d83de3bd5e83ee4f7 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Wed, 4 Jan 2017 14:38:18 -0600
Subject: [PATCH 04/18] First pass at virtualized queue support.

Also updated stream debug messages to consistently use trace_helper.
---
 src/hip_hcc.cpp    | 59 ++++++++++++++++++--------------
 src/hip_hcc.h      | 84 +++++++++++++++++++++++++++-------------------
 src/hip_memory.cpp |  7 ++--
 src/hip_stream.cpp |  2 +-
 4 files changed, 88 insertions(+), 64 deletions(-)

diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index c87e201c0c..3bb3d6d128 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -243,7 +243,7 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int
     _id(0), // will be set by add function.
     _flags(flags),
     _ctx(ctx),
-    _criticalData(av)
+    _criticalData(this, av)
 {
     unsigned schedBits = ctx->_ctxFlags & hipDeviceScheduleMask;
 
@@ -256,7 +256,6 @@ ihipStream_t::ihipStream_t(ihipCtx_t *ctx, hc::accelerator_view av, unsigned int
     };
 
 
-    tprintf(DB_SYNC, " streamCreate: stream=%s\n", ToString(this).c_str());
 };
 
 
@@ -271,7 +270,7 @@ ihipStream_t::~ihipStream_t()
 void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty)
 {
     if (! assertQueueEmpty) {
-        tprintf (DB_SYNC, "stream %s wait for queue-empty..\n", ToString(this).c_str());
+        tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
         hc::hcWaitMode waitMode = hc::hcWaitModeActive;
 
         if (_scheduleMode == Auto) {
@@ -406,21 +405,21 @@ void ihipStream_t::lockclose_postKernelCommand(const char * kernelName, hc::acce
 
 
 
-    //=============================================================================
-    // Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
-    // The packed _peerAgents can efficiently be used on each memory allocation.
-    template<>
-    void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
-    {
-        _peerCnt = 0;
-        std::for_each (_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) {
-            _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
-        });
-    }
+//=============================================================================
+// Recompute the peercnt and the packed _peerAgents whenever a peer is added or deleted.
+// The packed _peerAgents can efficiently be used on each memory allocation.
+template<>
+void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
+{
+    _peerCnt = 0;
+    std::for_each (_peers.begin(), _peers.end(), [this](ihipCtx_t* ctx) {
+        _peerAgents[_peerCnt++] = ctx->getDevice()->_hsaAgent;
+    });
+}
 
 
-    template<>
-    bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
+template<>
+bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
 {
     auto match = std::find(_peers.begin(), _peers.end(), peer);
     return (match != std::end(_peers));
@@ -489,6 +488,7 @@ void ihipCtxCriticalBase_t<CtxMutex>::addStream(ihipStream_t *stream)
 {
     stream->_id = _streams.size();
     _streams.push_back(stream);
+    tprintf(DB_SYNC, " addStream: %s\n", ToString(stream).c_str());
 }
 //=============================================================================
 
@@ -827,11 +827,11 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
 ihipCtx_t::ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags) :
     _ctxFlags(flags),
     _device(device),
-    _criticalData(deviceCnt)
+    _criticalData(this, deviceCnt)
 {
     locked_reset();
 
-    tprintf(DB_SYNC, "created ctx with defaultStream=%p\n", _defaultStream);
+    tprintf(DB_SYNC, "created ctx with defaultStream=%p (%s)\n", _defaultStream, ToString(_defaultStream).c_str());
 };
 
 
@@ -861,7 +861,7 @@ void ihipCtx_t::locked_reset()
     for (auto streamI=crit->const_streams().begin(); streamI!=crit->const_streams().end(); streamI++) {
         ihipStream_t *stream = *streamI;
         (*streamI)->locked_wait();
-        tprintf(DB_SYNC, " delete stream=%p\n", stream);
+        tprintf(DB_SYNC, " delete %s\n", ToString(stream).c_str());
 
         delete stream;
     }
@@ -905,15 +905,24 @@ ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *nee
         for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
             if (*iter != needyStream) {
                 auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
-                if (victimCritPtr && victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
+                if (victimCritPtr)   {
+                    if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
 
+                        victimCritPtr->_hasQueue = false;
 
-                    victimCritPtr->_hasQueue = false;
+                        tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n",
+                                ToString(*iter).c_str(), ToString(needyStream).c_str());
 
-                    tprintf(DB_SYNC, " stealActiveQueue move queue from victim:%s to needy:%s\n",
-                            ToString(*iter).c_str(), ToString(needyStream).c_str());
+                        // TODO - cleanup to remove forced setting to N
+                        hc::accelerator_view  av = victimCritPtr->_av;
+                        uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
+                        *p = 0; // damage the victim av so attempt to use it will fault.
 
-                    return victimCritPtr->_av;
+                        (*iter)->_criticalData.munlock(); 
+                        return av; 
+                    }  else {
+                        (*iter)->_criticalData.munlock(); 
+                    }
                 }
             }
         }
@@ -1415,7 +1424,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t stream)
     } else {
         // ALl streams have to wait for legacy default stream to be empty:
         if (!(stream->_flags & hipStreamNonBlocking))  {
-            tprintf(DB_SYNC, "stream %p wait default stream\n", stream);
+            tprintf(DB_SYNC, "%s wait default stream\n", ToString(stream).c_str());
             stream->getCtx()->_defaultStream->locked_wait();
         }
 
diff --git a/src/hip_hcc.h b/src/hip_hcc.h
index f2a2fb49fa..876e5df816 100644
--- a/src/hip_hcc.h
+++ b/src/hip_hcc.h
@@ -292,6 +292,34 @@ extern "C" {
 const hipStream_t hipStreamNull = 0x0;
 
 
+/**
+ * HIP IPC Handle Size
+ */
+#define HIP_IPC_HANDLE_SIZE 64
+class ihipIpcMemHandle_t
+{
+public:
+#if USE_IPC
+    hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
+#endif
+    char reserved[HIP_IPC_HANDLE_SIZE];
+    size_t psize;
+};
+
+
+class ihipModule_t {
+public:
+  hsa_executable_t executable;
+  hsa_code_object_t object;
+  std::string fileName;
+  void *ptr;
+  size_t size;
+
+  ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
+};
+
+
+//---
 // Used to remove lock, for performance or stimulating bugs.
 class FakeMutex
 {
@@ -330,21 +358,21 @@ public:
         _autoUnlock(autoUnlock)
 
     {
-        tprintf(DB_SYNC, "lock critical data %s.%p\n", typeid(T).name(), _criticalData);
+        tprintf(DB_SYNC, "lock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
         _criticalData->_mutex.lock();
     };
 
     ~LockedAccessor()
     {
         if (_autoUnlock) {
-        tprintf(DB_SYNC, "auto-unlock critical data %s.%p\n",typeid(T).name(),  _criticalData);
+        tprintf(DB_SYNC, "auto-unlock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
             _criticalData->_mutex.unlock();
         }
     }
 
     void unlock()
     {
-        tprintf(DB_SYNC, "unlock critical data %s.%p\n", typeid(T).name(), _criticalData);
+        tprintf(DB_SYNC, "unlock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
        _criticalData->_mutex.unlock();
     }
 
@@ -369,40 +397,16 @@ struct LockedBase {
     MUTEX_TYPE  _mutex;
 };
 
-/**
- * HIP IPC Handle Size
- */
-#define HIP_IPC_HANDLE_SIZE 64
-class ihipIpcMemHandle_t
-{
-public:
-#if USE_IPC
-    hsa_amd_ipc_memory_t ipc_handle; ///< ipc memory handle on ROCr
-#endif
-    char reserved[HIP_IPC_HANDLE_SIZE];
-    size_t psize;
-};
-
-
-class ihipModule_t {
-public:
-  hsa_executable_t executable;
-  hsa_code_object_t object;
-  std::string fileName;
-  void *ptr;
-  size_t size;
-
-  ihipModule_t() : executable(), object(), fileName(), ptr(nullptr), size(0) {}
-};
 
 template <typename MUTEX_TYPE>
 class ihipStreamCriticalBase_t : public LockedBase<MUTEX_TYPE>
 {
 public:
-    ihipStreamCriticalBase_t(hc::accelerator_view av) :
+    ihipStreamCriticalBase_t(ihipStream_t *parentStream, hc::accelerator_view av) :
         _kernelCnt(0),
         _av(av),
-        _hasQueue(true)
+        _hasQueue(true),
+        _parent(parentStream)
     {
     };
 
@@ -410,11 +414,20 @@ public:
     }
 
     ihipStreamCriticalBase_t<StreamMutex>  * mlock() { LockedBase<MUTEX_TYPE>::lock(); return this;};
+
+    void munlock() { 
+        tprintf(DB_SYNC, "munlock criticalData=%p for %s\n", this, ToString(this->_parent).c_str());
+        LockedBase<MUTEX_TYPE>::unlock(); 
+    };
+
     ihipStreamCriticalBase_t<StreamMutex>  * mtry_lock() { 
-        return LockedBase<MUTEX_TYPE>::try_lock() ?  this: nullptr; 
+        bool gotLock = LockedBase<MUTEX_TYPE>::try_lock() ;
+        tprintf(DB_SYNC, "mtry_lock=%d criticalData=%p for %s\n", gotLock, this, ToString(this->_parent).c_str());
+        return gotLock ?  this: nullptr; 
     };
 
 public:
+    ihipStream_t *              _parent;
     uint32_t                    _kernelCnt;    // Count of inflight kernels in this stream.  Reset at ::wait().
 
     hc::accelerator_view        _av;
@@ -596,8 +609,9 @@ template <typename MUTEX_TYPE>
 class ihipCtxCriticalBase_t : LockedBase<MUTEX_TYPE>
 {
 public:
-    ihipCtxCriticalBase_t(unsigned deviceCnt) :
-         _peerCnt(0)
+    ihipCtxCriticalBase_t(ihipCtx_t *parentCtx, unsigned deviceCnt) :
+        _parent(parentCtx),
+        _peerCnt(0)
     {
         _peerAgents = new hsa_agent_t[deviceCnt];
     };
@@ -633,6 +647,8 @@ public:
 
     friend class LockedAccessor<ihipCtxCriticalBase_t>;
 private:
+    ihipCtx_t     *              _parent;
+
     //--- Stream Tracker:
     std::list< ihipStream_t* > _streams;   // streams associated with this device.
 
@@ -739,7 +755,7 @@ hipStream_t ihipSyncAndResolveStream(hipStream_t);
 // Stream printf functions:
 inline std::ostream& operator<<(std::ostream& os, const ihipStream_t& s)
 {
-    os << "stream#";
+    os << "stream:";
     os << s.getDevice()->_deviceId;;
     os << '.';
     os << s._id;
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 74578e9b4b..5bc77cf543 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -131,7 +131,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
                 LockedAccessor_CtxCrit_t crit(ctx->criticalData());
                 // the peerCnt always stores self so make sure the trace actually
                 peerCnt = crit->peerCnt();
-                tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allowed %d other peer(s) access\n",
+                tprintf(DB_MEM, " allocated device_mem ptr:%p size:%zu on dev:%d and allow access to %d other peer(s)\n",
                         *ptr, sizeBytes, device->_deviceId, peerCnt-1);
                 if (peerCnt > 1) {
 
@@ -841,7 +841,6 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
         if (HIP_API_BLOCKING) {
             tprintf (DB_SYNC, "%s LAUNCH_BLOCKING wait for hipMemsetAsync.\n", ToString(stream).c_str());
             cf.wait();
-            //tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
         }
     } else {
         e = hipErrorInvalidValue;
@@ -892,9 +891,9 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
 
 
         if (HIP_LAUNCH_BLOCKING) {
-            tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset [stream:%p].\n", __func__, (void*)stream);
+            tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING wait for memset in %s.\n", __func__, ToString(stream).c_str());
             cf.wait();
-            tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed [stream:%p].\n", __func__, (void*)stream);
+            tprintf (DB_SYNC, "'%s' LAUNCH_BLOCKING memset completed in %s.\n", __func__, ToString(stream).c_str());
         }
     } else {
         e = hipErrorInvalidValue;
diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp
index d754ffe5f6..8641f72265 100644
--- a/src/hip_stream.cpp
+++ b/src/hip_stream.cpp
@@ -54,7 +54,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
             *stream = istream;
         }
 
-        tprintf(DB_SYNC, "hipStreamCreate, stream=%p\n", *stream);
+        tprintf(DB_SYNC, "hipStreamCreate, %s\n", ToString(*stream).c_str());
     } else {
         e = hipErrorInvalidDevice;
     }

From a3e0012567697a78272ba2e509875bfeb410a367 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 9 Jan 2017 17:19:40 -0600
Subject: [PATCH 05/18] Add HIP_MAX_QUEUES feature.

Includes some tricky manipulation of the locks for contexts and streams.
The issue is that stealing a queue from a stream requires that we lock the
context to walk the streams to find a victim.  To avoid deadlock, we can't
have a stream locked when we lock the context.  This implementation
releases the stream lock, then acquires the context lock and selects the
victim.
A more stable implementation might be to copy the stream list
from a context so that a lock is not required to walk all streams.
Smart shared_ptr could be used to prevent the streams from being
deallocated during the walk.
---
 src/hip_hcc.cpp    | 78 ++++++++++++++++++++++++++++++++--------------
 src/hip_hcc.h      | 18 +++++------
 src/hip_memory.cpp |  3 ++
 src/hip_stream.cpp | 11 +++++--
 4 files changed, 76 insertions(+), 34 deletions(-)

diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index 3bb3d6d128..e5b7937e25 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -265,11 +265,37 @@ ihipStream_t::~ihipStream_t()
 }
 
 
+inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit)
+{
+    if (HIP_MAX_QUEUES && !streamCrit->_hasQueue)  {
+
+        // To avoid deadlock, we have to release the stream lock before acquiring context lock.
+        // Else we can get hung if another thread has the context lock is trying to get lock for this stream.
+        // We lock it again below.
+        streamCrit->munlock();
+
+        // Obtain mutex access to the device critical data, release by destructor
+        LockedAccessor_CtxCrit_t  ctxCrit(this->_ctx->criticalData());
+        // TODO
+        auto needyCritPtr = this->_criticalData.mlock();
+
+        // Second test to ensure we still need to steal the queue - another thread may have 
+        // snuck in here and already solved the issue.
+        if (!needyCritPtr->_hasQueue) {
+            needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
+        }
+        
+        streamCrit->_hasQueue = true;
+    }
+    assert(streamCrit->_hasQueue);
+}
+
+
 //Wait for all kernel and data copy commands in this stream to complete.
 //This signature should be used in routines that already have locked the stream mutex
-void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty)
+void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit)
 {
-    if (! assertQueueEmpty) {
+    if (crit->_hasQueue) {
         tprintf (DB_SYNC, "%s wait for queue-empty..\n", ToString(this).c_str());
         hc::hcWaitMode waitMode = hc::hcWaitModeActive;
 
@@ -294,6 +320,8 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
         }
 
         crit->_av.wait(waitMode);
+    } else {
+        tprintf (DB_SYNC, "%s wait for queue empty (done since stream has no physical queue).\n", ToString(this).c_str());
     }
 
     crit->_kernelCnt = 0;
@@ -301,11 +329,11 @@ void ihipStream_t::wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty
 
 //---
 //Wait for all kernel and data copy commands in this stream to complete.
-void ihipStream_t::locked_wait(bool assertQueueEmpty)
+void ihipStream_t::locked_wait()
 {
     LockedAccessor_StreamCrit_t crit(_criticalData);
 
-    wait(crit, assertQueueEmpty);
+    wait(crit);
 
 };
 
@@ -314,6 +342,8 @@ void ihipStream_t::locked_waitEvent(hipEvent_t event)
 {
     LockedAccessor_StreamCrit_t crit(_criticalData);
 
+    this->ensureHaveQueue(crit);
+
     crit->_av.create_blocking_marker(event->_marker);
 }
 
@@ -324,6 +354,7 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event)
     // Lock the stream to prevent simultaneous access
     LockedAccessor_StreamCrit_t crit(_criticalData);
 
+    this->ensureHaveQueue(crit);
     event->_marker = crit->_av.create_marker();
 }
 
@@ -361,19 +392,11 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
        this->wait(crit);
        crit->_kernelCnt = 0;
     }
-    crit->_kernelCnt++;
 
-    if (HIP_MAX_QUEUES && !crit->_hasQueue)  {
-        // Obtain mutex access to the device critical data, release by destructor
-        LockedAccessor_CtxCrit_t  ctxCrit(this->_ctx->criticalData());
-        crit->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
-        crit->_hasQueue = true;
-    }
+    this->ensureHaveQueue(crit);
+    
 
 
-
-    assert(crit->_hasQueue);
-
     return crit;
 }
 
@@ -896,16 +919,18 @@ std::string ihipCtx_t::toString() const
 };
 
 
-hc::accelerator_view 
-ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream )
+hc::accelerator_view
+ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream)
 {
 
     // TODO - review handling if queue can't be found.
     while (1) {
+
         for (auto iter=ctxCrit->streams().begin(); iter != ctxCrit->streams().end(); iter++) {
             if (*iter != needyStream) {
                 auto victimCritPtr = (*iter)->_criticalData.mtry_lock();
                 if (victimCritPtr)   {
+                    // try-lock succeeded:
                     if (victimCritPtr->_hasQueue && (victimCritPtr->_kernelCnt == 0)) {
 
                         victimCritPtr->_hasQueue = false;
@@ -913,16 +938,16 @@ ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *nee
                         tprintf(DB_SYNC, " stealActiveQueue from victim:%s to needy:%s\n",
                                 ToString(*iter).c_str(), ToString(needyStream).c_str());
 
+                        hc::accelerator_view av = victimCritPtr->_av;
+
                         // TODO - cleanup to remove forced setting to N
-                        hc::accelerator_view  av = victimCritPtr->_av;
                         uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
                         *p = 0; // damage the victim av so attempt to use it will fault.
 
                         (*iter)->_criticalData.munlock(); 
-                        return av; 
-                    }  else {
-                        (*iter)->_criticalData.munlock(); 
-                    }
+                        return av;
+                    }  
+                    (*iter)->_criticalData.munlock(); 
                 }
             }
         }
@@ -935,7 +960,8 @@ ihipCtx_t::createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit)
 {
     if (HIP_MAX_QUEUES && (ctxCrit->streams().size() >= HIP_MAX_QUEUES)) {
         // Steal a queue from an existing stream:
-        return this->stealActiveQueue (ctxCrit, nullptr);
+        hc::accelerator_view av = this->stealActiveQueue (ctxCrit, nullptr);
+        return av;
     } else {
         // Create a new view
         return getWriteableDevice()->_acc.create_view();
@@ -1838,6 +1864,7 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
                  src, srcPtrInfo._hostPointer, srcPtrInfo._devicePointer, srcPtrInfo._sizeBytes,
                  srcPtrInfo._appId, srcTracked, srcPtrInfo._isInDeviceMem);
 
+        this->ensureHaveQueue(crit);
 
 #if USE_COPY_EXT_V2
         crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
@@ -1902,6 +1929,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
 
             // Perform fast asynchronous copy - we know copyDevice != NULL based on check above
             try {
+                this->ensureHaveQueue(crit);
+
                 if (HIP_FORCE_SYNC_COPY) {
 #if USE_COPY_EXT_V2
                     crit->_av.copy_ext      (src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, &copyDevice->getDevice()->_acc, forceUnpinnedCopy);
@@ -1928,6 +1957,8 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
 
         } else {
             LockedAccessor_StreamCrit_t crit(_criticalData);
+
+            this->ensureHaveQueue(crit);
 #if USE_COPY_EXT_V2
             crit->_av.copy_ext(src, dst, sizeBytes, hcCopyDir, srcPtrInfo, dstPtrInfo, copyDevice ? &copyDevice->getDevice()->_acc : nullptr, forceUnpinnedCopy);
 #else
@@ -1985,6 +2016,7 @@ hipError_t hipHccGetAccelerator(int deviceId, hc::accelerator *acc)
 
 
 //---
+// Warning - with HIP_MAX_QUEUES!=0 there is no mechanism to prevent accelerator_view from being re-assigned...
 hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **av)
 {
     HIP_INIT_API(stream, av);
@@ -1994,7 +2026,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
         stream = device->_defaultStream;
     }
 
-    *av = stream->locked_getAv();
+    *av = stream->locked_getAv(); // TODO - review.  
 
     hipError_t err = hipSuccess;
     return ihipLogStatus(err);
diff --git a/src/hip_hcc.h b/src/hip_hcc.h
index 876e5df816..e19ce63263 100644
--- a/src/hip_hcc.h
+++ b/src/hip_hcc.h
@@ -358,21 +358,21 @@ public:
         _autoUnlock(autoUnlock)
 
     {
-        tprintf(DB_SYNC, "lock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
+        tprintf(DB_SYNC, "locking criticalData=%p for %s..\n", _criticalData, ToString(_criticalData->_parent).c_str());
         _criticalData->_mutex.lock();
     };
 
     ~LockedAccessor()
     {
         if (_autoUnlock) {
-        tprintf(DB_SYNC, "auto-unlock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
+        tprintf(DB_SYNC, "auto-unlocking criticalData=%p for %s...\n", _criticalData, ToString(_criticalData->_parent).c_str());
             _criticalData->_mutex.unlock();
         }
     }
 
     void unlock()
     {
-        tprintf(DB_SYNC, "unlock criticalData=%p for %s\n", _criticalData, ToString(_criticalData->_parent).c_str());
+        tprintf(DB_SYNC, "unlocking criticalData=%p for %s...\n", _criticalData, ToString(_criticalData->_parent).c_str());
        _criticalData->_mutex.unlock();
     }
 
@@ -416,13 +416,13 @@ public:
     ihipStreamCriticalBase_t<StreamMutex>  * mlock() { LockedBase<MUTEX_TYPE>::lock(); return this;};
 
     void munlock() { 
-        tprintf(DB_SYNC, "munlock criticalData=%p for %s\n", this, ToString(this->_parent).c_str());
+        tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this, ToString(this->_parent).c_str());
         LockedBase<MUTEX_TYPE>::unlock(); 
     };
 
     ihipStreamCriticalBase_t<StreamMutex>  * mtry_lock() { 
         bool gotLock = LockedBase<MUTEX_TYPE>::try_lock() ;
-        tprintf(DB_SYNC, "mtry_lock=%d criticalData=%p for %s\n", gotLock, this, ToString(this->_parent).c_str());
+        tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this, ToString(this->_parent).c_str());
         return gotLock ?  this: nullptr; 
     };
 
@@ -476,7 +476,7 @@ public:
     void                 lockclose_postKernelCommand(const char *kernelName, hc::accelerator_view *av);
 
 
-    void                 locked_wait(bool assertQueueEmpty=false);
+    void                 locked_wait();
 
     hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
 
@@ -487,7 +487,7 @@ public:
     //---
 
     // Use this if we already have the stream critical data mutex:
-    void                 wait(LockedAccessor_StreamCrit_t &crit, bool assertQueueEmpty=false);
+    void                 wait(LockedAccessor_StreamCrit_t &crit);
 
     void launchModuleKernel(hc::accelerator_view av, hsa_signal_t signal,
                             uint32_t blockDimX, uint32_t blockDimY, uint32_t blockDimZ,
@@ -502,6 +502,7 @@ public:
     const ihipDevice_t *     getDevice() const;
     ihipCtx_t *              getCtx() const;
 
+    void ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCrit);
 
 public:
     //---
@@ -693,8 +694,7 @@ public: // Functions:
     void locked_syncDefaultStream(bool waitOnSelf);
 
     // Will allocate a queue and assign it to the needyStream:
-    hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit,
-                        ihipStream_t *needyStream);
+    hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream);
     hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
 
     ihipCtxCritical_t  &criticalData() { return _criticalData; }; 
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 5bc77cf543..372d295b89 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -813,6 +813,8 @@ hipError_t hipMemsetAsync(void* dst, int  value, size_t sizeBytes, hipStream_t s
     if (stream) {
         auto crit = stream->lockopen_preKernelCommand();
 
+        stream->ensureHaveQueue(crit);
+
         hc::completion_future cf ;
 
         if ((sizeBytes & 0x3) == 0) {
@@ -863,6 +865,7 @@ hipError_t hipMemset(void* dst, int  value, size_t sizeBytes )
     if (stream) {
         auto crit = stream->lockopen_preKernelCommand();
 
+        stream->ensureHaveQueue(crit);
         hc::completion_future cf ;
 
         if ((sizeBytes & 0x3) == 0) {
diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp
index 8641f72265..aae412160f 100644
--- a/src/hip_stream.cpp
+++ b/src/hip_stream.cpp
@@ -48,6 +48,7 @@ hipError_t ihipStreamCreate(hipStream_t *stream, unsigned int flags)
         {
             // Obtain mutex access to the device critical data, release by destructor
             LockedAccessor_CtxCrit_t  ctxCrit(ctx->criticalData());
+
             auto istream = new ihipStream_t(ctx, ctx->createOrStealQueue(ctxCrit), flags);
 
             ctxCrit->addStream(istream);
@@ -124,8 +125,14 @@ hipError_t hipStreamQuery(hipStream_t stream)
         stream =  device->_defaultStream;
     }
 
-    LockedAccessor_StreamCrit_t crit(stream->_criticalData);
-    int pendingOps = crit->_av.get_pending_async_ops();
+    int pendingOps = 0;
+
+    {
+        LockedAccessor_StreamCrit_t crit(stream->_criticalData);
+        if (crit->_hasQueue) {
+            pendingOps = crit->_av.get_pending_async_ops();
+        }
+    }
 
 
     hipError_t e = (pendingOps > 0) ? hipErrorNotReady : hipSuccess;

From a15d236de31dcc4f51b7cf582c6b8137cf28cd00 Mon Sep 17 00:00:00 2001
From: Ben Sander <ben.sander@amd.com>
Date: Mon, 9 Jan 2017 20:22:43 -0600
Subject: [PATCH 06/18] Fix delete[]

---
 tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp
index 7c83211f14..d5fc4cb20f 100644
--- a/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp
+++ b/tests/src/runtimeApi/multiThread/hipMultiThreadDevice.cpp
@@ -33,7 +33,7 @@ void createThenDestroyStreams(int iterations, int burstSize)
         }
     }
 
-    delete streams;
+    delete[] streams;
 }
 
 

From 7ed2b163de5a99f885cdc1a44f8bc5ac94d16f27 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 10 Jan 2017 17:54:22 +0300
Subject: [PATCH 07/18] [HIPIFY] CUdevice_attribute support up to CUDA 8.0.44

Attributes that are not yet supported by HIP are marked as HIP_UNSUPPORTED.
---
 hipify-clang/src/Cuda2Hip.cpp | 124 +++++++++++++++++++++++++++-------
 1 file changed, 99 insertions(+), 25 deletions(-)

diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp
index 4ea34606db..7995b3995f 100644
--- a/hipify-clang/src/Cuda2Hip.cpp
+++ b/hipify-clang/src/Cuda2Hip.cpp
@@ -319,31 +319,105 @@ struct cuda2hipMap {
     cuda2hipRename["CUdevice_attribute_enum"]                   = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUdevice_attribute"]                        = {"hipDeviceAttribute_t", CONV_TYPE, API_DRIVER};
 
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"]                = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"]                      = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"]                      = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"]                      = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"]                       = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"]                       = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"]                       = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"]                = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"]                            = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"]              = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"]                           = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"]                    = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"]              = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"]                         = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"]                        = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"]       = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"]             = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"]             = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"]                   = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"]                           = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"]                        = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"]                      = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK"]                   = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X"]                         = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y"]                         = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z"]                         = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X"]                          = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y"]                          = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z"]                          = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK"]             = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK"]                 = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY"]                   = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_WARP_SIZE"]                               = {"hipDeviceAttributeWarpSize", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK"]                 = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_REGISTERS_PER_BLOCK"]                     = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CLOCK_RATE"]                              = {"hipDeviceAttributeClockRate", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE"]                       = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH"]                 = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_MODE"]                            = {"hipDeviceAttributeComputeMode", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE"]                           = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR"]          = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR"]                = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR"]                = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS"]                      = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_BUS_ID"]                              = {"hipDeviceAttributePciBusId", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID"]                           = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR"]    = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_DRIVER};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"]                         = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};
+    // unsupported yet by HIP
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"]                               = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"]                      = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"]                             = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"]                    = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"]                     = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_INTEGRATED"]                              = {"hipDeviceAttributeIntegrated", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY"]                     = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH"]                 = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT"]                = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH"]                 = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_WIDTH"]           = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_HEIGHT"]          = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_ARRAY_NUMSLICES"]       = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT"]                       = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_ECC_ENABLED"]                             = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TCC_DRIVER"]                              = {"hipDeviceAttributeTccDriver", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING"]                      = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_GATHER_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT_ALTERNATE"]      = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH_ALTERNATE"]       = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID"]                           = {"hipDeviceAttributePciDomainId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_PITCH_ALIGNMENT"]                 = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_WIDTH"]                 = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_HEIGHT"]                = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE3D_DEPTH"]                 = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE1D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxSurfaceCudemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxSurfaceCudemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxSurfaceCudemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_PITCH"]          = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_MIPMAPPED_HEIGHT"]      = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_MIPMAPPED_WIDTH"]       = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_STREAM_PRIORITIES_SUPPORTED"]             = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_GLOBAL_L1_CACHE_SUPPORTED"]               = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_LOCAL_L1_CACHE_SUPPORTED"]                = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR"]        = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY"]                          = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD_GROUP_ID"]                = {"hipDeviceAttributeMultiGpuBoardGroupId", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX"]                                     = {"hipDeviceAttributeMax", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    // deprecated, do not use
+    // cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_TEX2D_GATHER"]                     = {"hipDeviceAttributeCanTex2DGather", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_HOST_NATIVE_ATOMIC_SUPPORTED"]            = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_SINGLE_TO_DOUBLE_PRECISION_PERF_RATIO"]   = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_PAGEABLE_MEMORY_ACCESS"]                  = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"]               = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED"]            = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_CAN_USE_HOST_POINTER_FOR_REGISTERED_MEM"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
 
     cuda2hipRename["CUdevprop_st"]                              = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};
     cuda2hipRename["CUdevprop"]                                 = {"hipDeviceProp_t", CONV_TYPE, API_DRIVER};

From 3a99536ed510f43a0b1131ae5cca187fc3a84ea2 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 10 Jan 2017 19:29:33 +0300
Subject: [PATCH 08/18] [HIPIFY] cudaDeviceAttr (RT API) support up to CUDA
 8.0.44

Attributes that are not yet supported by HIP are marked as HIP_UNSUPPORTED.
---
 hipify-clang/src/Cuda2Hip.cpp | 125 ++++++++++++++++++++++++++--------
 1 file changed, 97 insertions(+), 28 deletions(-)

diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp
index 7995b3995f..2f6d731c96 100644
--- a/hipify-clang/src/Cuda2Hip.cpp
+++ b/hipify-clang/src/Cuda2Hip.cpp
@@ -347,7 +347,9 @@ struct cuda2hipMap {
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTI_GPU_BOARD"]                         = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_DRIVER};
     // unsupported yet by HIP
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAX_PITCH"]                               = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT"]                       = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT"]                      = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    // Deprecated. Use instead CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_GPU_OVERLAP"]                             = {"hipDeviceAttributeAsyncEngineCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT"]                    = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT"]                     = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
@@ -392,9 +394,9 @@ struct cuda2hipMap {
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_WIDTH"]         = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_HEIGHT"]        = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACE2D_LAYERED_LAYERS"]        = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxSurfaceCudemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxSurfaceCudemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
-    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxSurfaceCudemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_WIDTH"]            = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_WIDTH"]    = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
+    cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_SURFACECUBEMAP_LAYERED_LAYERS"]   = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_WIDTH"]          = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
     cuda2hipRename["CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LINEAR_HEIGHT"]         = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_DRIVER, HIP_UNSUPPORTED};
@@ -703,31 +705,98 @@ struct cuda2hipMap {
     cuda2hipRename["cudaDeviceAttr"]          = {"hipDeviceAttribute_t", CONV_TYPE, API_RUNTIME};
     cuda2hipRename["cudaDeviceGetAttribute"]  = {"hipDeviceGetAttribute", CONV_DEV, API_RUNTIME};
 
-    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]               = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                     = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                     = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                     = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxGridDimX"]                      = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxGridDimY"]                      = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                      = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]          = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrTotalConstantMemory"]              = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrWarpSize"]                         = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]             = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrClockRate"]                        = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMemoryClockRate"]                  = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]             = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMultiProcessorCount"]              = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrComputeMode"]                      = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrL2CacheSize"]                      = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]      = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]           = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrComputeCapabilityMinor"]           = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrConcurrentKernels"]                = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrPciBusId"]                         = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrPciDeviceId"]                      = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"] = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
-    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                  = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerBlock"]                = {"hipDeviceAttributeMaxThreadsPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimX"]                      = {"hipDeviceAttributeMaxBlockDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimY"]                      = {"hipDeviceAttributeMaxBlockDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxBlockDimZ"]                      = {"hipDeviceAttributeMaxBlockDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimX"]                       = {"hipDeviceAttributeMaxGridDimX", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimY"]                       = {"hipDeviceAttributeMaxGridDimY", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxGridDimZ"]                       = {"hipDeviceAttributeMaxGridDimZ", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerBlock"]           = {"hipDeviceAttributeMaxSharedMemoryPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrTotalConstantMemory"]               = {"hipDeviceAttributeTotalConstantMemory", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrWarpSize"]                          = {"hipDeviceAttributeWarpSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxRegistersPerBlock"]              = {"hipDeviceAttributeMaxRegistersPerBlock", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrClockRate"]                         = {"hipDeviceAttributeClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMemoryClockRate"]                   = {"hipDeviceAttributeMemoryClockRate", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrGlobalMemoryBusWidth"]              = {"hipDeviceAttributeMemoryBusWidth", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMultiProcessorCount"]               = {"hipDeviceAttributeMultiprocessorCount", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeMode"]                       = {"hipDeviceAttributeComputeMode", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrL2CacheSize"]                       = {"hipDeviceAttributeL2CacheSize", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxThreadsPerMultiProcessor"]       = {"hipDeviceAttributeMaxThreadsPerMultiProcessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeCapabilityMajor"]            = {"hipDeviceAttributeComputeCapabilityMajor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrComputeCapabilityMinor"]            = {"hipDeviceAttributeComputeCapabilityMinor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrConcurrentKernels"]                 = {"hipDeviceAttributeConcurrentKernels", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciBusId"]                          = {"hipDeviceAttributePciBusId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrPciDeviceId"]                       = {"hipDeviceAttributePciDeviceId", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrMaxSharedMemoryPerMultiprocessor"]  = {"hipDeviceAttributeMaxSharedMemoryPerMultiprocessor", CONV_DEV, API_RUNTIME};
+    cuda2hipRename["cudaDevAttrIsMultiGpuBoard"]                   = {"hipDeviceAttributeIsMultiGpuBoard", CONV_DEV, API_RUNTIME};
+    // unsupported yet by HIP
+    cuda2hipRename["cudaDevAttrMaxPitch"]                          = {"hipDeviceAttributeMaxPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrTextureAlignment"]                  = {"hipDeviceAttributeTextureAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    // Is not deprecated as CUDA Driver's API analogue CU_DEVICE_ATTRIBUTE_GPU_OVERLAP
+    cuda2hipRename["cudaDevAttrGpuOverlap"]                        = {"hipDeviceAttributeGpuOverlap", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrKernelExecTimeout"]                 = {"hipDeviceAttributeKernelExecTimeout", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrIntegrated"]                        = {"hipDeviceAttributeIntegrated", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrCanMapHostMemory"]                  = {"hipDeviceAttributeCanMapHostMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture1DWidth"]                 = {"hipDeviceAttributeMaxTexture1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DWidth"]                 = {"hipDeviceAttributeMaxTexture2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DHeight"]                = {"hipDeviceAttributeMaxTexture2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DWidth"]                 = {"hipDeviceAttributeMaxTexture3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DHeight"]                = {"hipDeviceAttributeMaxTexture3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DDepth"]                 = {"hipDeviceAttributeMaxTexture3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLayeredWidth"]          = {"hipDeviceAttributeMaxTexture2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLayeredHeight"]         = {"hipDeviceAttributeMaxTexture2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLayeredLayers"]         = {"hipDeviceAttributeMaxTexture2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrSurfaceAlignment"]                  = {"hipDeviceAttributeSurfaceAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrEccEnabled"]                        = {"hipDeviceAttributeEccEnabled", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrTccDriver"]                         = {"hipDeviceAttributeTccDriver", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrUnifiedAddressing"]                 = {"hipDeviceAttributeUnifiedAddressing", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture1DLayeredWidth"]          = {"hipDeviceAttributeMaxTexture1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture1DLayeredLayers"]         = {"hipDeviceAttributeMaxTexture1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DGatherWidth"]           = {"hipDeviceAttributeMaxTexture2DGatherWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DGatherHeight"]          = {"hipDeviceAttributeMaxTexture2DGatherHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DWidthAlt"]              = {"hipDeviceAttributeMaxTexture3DWidthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DHeightAlt"]             = {"hipDeviceAttributeMaxTexture3DHeightAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture3DDepthAlt"]              = {"hipDeviceAttributeMaxTexture3DDepthAlternate", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrPciDomainId"]                       = {"hipDeviceAttributePciDomainId", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrTexturePitchAlignment"]             = {"hipDeviceAttributeTexturePitchAlignment", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTextureCubemapWidth"]            = {"hipDeviceAttributeMaxTextureCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredWidth"]     = {"hipDeviceAttributeMaxTextureCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTextureCubemapLayeredLayers"]    = {"hipDeviceAttributeMaxTextureCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface1DWidth"]                 = {"hipDeviceAttributeMaxSurface1DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface2DWidth"]                 = {"hipDeviceAttributeMaxSurface2DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface2DHeight"]                = {"hipDeviceAttributeMaxSurface2DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface3DWidth"]                 = {"hipDeviceAttributeMaxSurface3DWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface3DHeight"]                = {"hipDeviceAttributeMaxSurface3DHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface3DDepth"]                 = {"hipDeviceAttributeMaxSurface3DDepth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface1DLayeredWidth"]          = {"hipDeviceAttributeMaxSurface1DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface1DLayeredLayers"]         = {"hipDeviceAttributeMaxSurface1DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface2DLayeredWidth"]          = {"hipDeviceAttributeMaxSurface2DLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface2DLayeredHeight"]         = {"hipDeviceAttributeMaxSurface2DLayeredHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurface2DLayeredLayers"]         = {"hipDeviceAttributeMaxSurface2DLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurfaceCubemapWidth"]            = {"hipDeviceAttributeMaxSurfaceCubemapWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredWidth"]     = {"hipDeviceAttributeMaxSurfaceCubemapLayeredWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxSurfaceCubemapLayeredLayers"]    = {"hipDeviceAttributeMaxSurfaceCubemapLayeredLayers", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture1DLinearWidth"]           = {"hipDeviceAttributeMaxTexture1DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLinearWidth"]           = {"hipDeviceAttributeMaxTexture2DLinearWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLinearHeight"]          = {"hipDeviceAttributeMaxTexture2DLinearHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DLinearPitch"]           = {"hipDeviceAttributeMaxTexture2DLinearPitch", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedWidth"]        = {"hipDeviceAttributeMaxTexture2DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture2DMipmappedHeight"]       = {"hipDeviceAttributeMaxTexture2DMipmappedHeight", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxTexture1DMipmappedWidth"]        = {"hipDeviceAttributeMaxTexture1DMipmappedWidth", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrStreamPrioritiesSupported"]         = {"hipDeviceAttributeStreamPrioritiesSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrGlobalL1CacheSupported"]            = {"hipDeviceAttributeGlobalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrLocalL1CacheSupported"]             = {"hipDeviceAttributeLocalL1CacheSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMaxRegistersPerMultiprocessor"]     = {"hipDeviceAttributeMaxRegistersPerMultiprocessor", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrManagedMemory"]                     = {"hipDeviceAttributeManagedMemory", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrMultiGpuBoardGroupID"]              = {"hipDeviceAttributeMultiGpuBoardGroupID", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["cudaDevAttrHostNativeAtomicSupported"]         = {"hipDeviceAttributeHostNativeAtomicSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrSingleToDoublePrecisionPerfRatio"]  = {"hipDeviceAttributeSingleToDoublePrecisionPerfRatio", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrPageableMemoryAccess"]              = {"hipDeviceAttributePageableMemoryAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrConcurrentManagedAccess"]           = {"hipDeviceAttributeConcurrentManagedAccess", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrComputePreemptionSupported"]        = {"hipDeviceAttributeComputePreemptionSupported", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDevAttrCanUseHostPointerForRegisteredMem"] = {"hipDeviceAttributeCanUseHostPointerForRegisteredMem", CONV_DEV, API_RUNTIME, HIP_UNSUPPORTED};
 
     // Pointer Attributes
     cuda2hipRename["cudaPointerAttributes"]      = {"hipPointerAttribute_t", CONV_TYPE, API_RUNTIME};

From 9a0780001bfee903ff91ba464c23188faccffde7 Mon Sep 17 00:00:00 2001
From: Evgeny Mankov <Evgeniy.Mankov@amd.com>
Date: Tue, 10 Jan 2017 20:24:27 +0300
Subject: [PATCH 09/18] [HIPIFY] cudaDataType_t and libraryPropertyType_t
 support (CUDA 8.0.44 only)

All are marked as HIP_UNSUPPORTED.
IMPORTANT:
1. libraryPropertyType_t has no cuda prefix, so a new matcher is needed (TODO).
2. All libraries (cublas, cufft, cusolver, cusparse, nvgraph) have used these types since CUDA 8.0.
---
 hipify-clang/src/Cuda2Hip.cpp | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/hipify-clang/src/Cuda2Hip.cpp b/hipify-clang/src/Cuda2Hip.cpp
index 2f6d731c96..938c980684 100644
--- a/hipify-clang/src/Cuda2Hip.cpp
+++ b/hipify-clang/src/Cuda2Hip.cpp
@@ -585,6 +585,35 @@ struct cuda2hipMap {
     cuda2hipRename["cuProfilerStop"]                            = {"hipProfilerStop", CONV_OTHER, API_DRIVER};
 
     /////////////////////////////// CUDA RT API ///////////////////////////////
+    // Data types
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["cudaDataType_t"]              = {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["cudaDataType"]                = {"hipDataType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_16F"]                  = {"hipR16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_16F"]                  = {"hipC16F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_32F"]                  = {"hipR32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_32F"]                  = {"hipC32F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_64F"]                  = {"hipR64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_64F"]                  = {"hipC64F", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_8I"]                   = {"hipR8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_8I"]                   = {"hipC8I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_8U"]                   = {"hipR8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_8U"]                   = {"hipC8U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_32I"]                  = {"hipR32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_32I"]                  = {"hipC32I", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_R_32U"]                  = {"hipR32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["CUDA_C_32U"]                  = {"hipC32U", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+
+    // Library property types
+    // IMPORTANT: no cuda prefix
+    // TO_DO: new matcher is needed
+    // unsupported yet by HIP [CUDA 8.0.44]
+    cuda2hipRename["libraryPropertyType_t"]       = {"hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["libraryPropertyType"]         = {"hipLibraryPropertyType_t", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["MAJOR_VERSION"]               = {"hipLibraryMajorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["MINOR_VERSION"]               = {"hipLibraryMinorVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+    cuda2hipRename["PATCH_LEVEL"]                 = {"hipLibraryPatchVersion", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED};
+
     // Error API
     cuda2hipRename["cudaGetLastError"]               = {"hipGetLastError", CONV_ERR, API_RUNTIME};
     cuda2hipRename["cudaPeekAtLastError"]            = {"hipPeekAtLastError", CONV_ERR, API_RUNTIME};

From 39910029a6c448b67e0cb1a4fc1a2d5c0878eddd Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 15:06:25 -0600
Subject: [PATCH 10/18] Added proper device data types

Change-Id: I42029635ff68c3c13a764a3eda6447e6c77878c6
---
 include/hip/hcc_detail/device_functions.h |   11 +-
 include/hip/hcc_detail/hip_ldg.h          |    5 +-
 include/hip/hcc_detail/hip_runtime.h      |    4 +-
 include/hip/hcc_detail/hip_vector_types.h | 4295 +++++++++++++++++++--
 src/device_util.cpp                       |  455 +--
 src/hip_ldg.cpp                           |   12 +-
 tests/src/deviceLib/hip_test_ldg.cpp      |    2 +-
 7 files changed, 4010 insertions(+), 774 deletions(-)

diff --git a/include/hip/hcc_detail/device_functions.h b/include/hip/hcc_detail/device_functions.h
index 8fa870664f..e2b061c640 100644
--- a/include/hip/hcc_detail/device_functions.h
+++ b/include/hip/hcc_detail/device_functions.h
@@ -20,13 +20,18 @@ THE SOFTWARE.
 #ifndef HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
 #define HIP_HCC_DETAIL_DEVICE_FUNCTIONS_H
 
-#include "hip_runtime.h"
+#include <hip/hip_runtime.h>
+#include <hip/hip_vector_types.h>
 
 __device__ float __int_as_float (int x);
 
 __device__ double __hiloint2double (int hi, int lo);
 
-extern __HIP_DEVICE__ double  __longlong_as_double(long long int x);
-extern __HIP_DEVICE__ long long int __double_as_longlong(double x);
+__device__ char4 __hip_hc_add8pk(char4, char4);
+__device__ char4 __hip_hc_sub8pk(char4, char4);
+__device__ char4 __hip_hc_mul8pk(char4, char4);
+
+extern __device__ double  __longlong_as_double(long long int x);
+extern __device__ long long int __double_as_longlong(double x);
 
 #endif
diff --git a/include/hip/hcc_detail/hip_ldg.h b/include/hip/hcc_detail/hip_ldg.h
index 7dd6451749..65292951f0 100644
--- a/include/hip/hcc_detail/hip_ldg.h
+++ b/include/hip/hcc_detail/hip_ldg.h
@@ -25,8 +25,8 @@ THE SOFTWARE.
 
 #if __HCC__
 #if __hcc_workweek__ >= 16164
-#include "hip/hip_vector_types.h"
-#include "hip/hcc_detail/host_defines.h"
+#include "hip_vector_types.h"
+#include "host_defines.h"
 
 
 __device__ char                 __ldg(const char* );
@@ -75,4 +75,3 @@ __device__ double2              __ldg(const double2* );
 #endif  // __HCC__
 
 #endif  // HIP_LDG_H
-
diff --git a/include/hip/hcc_detail/hip_runtime.h b/include/hip/hcc_detail/hip_runtime.h
index 78accc0c5b..f747b446d7 100644
--- a/include/hip/hcc_detail/hip_runtime.h
+++ b/include/hip/hcc_detail/hip_runtime.h
@@ -46,6 +46,7 @@ THE SOFTWARE.
 #define CUDA_SUCCESS hipSuccess
 
 #include <hip/hip_runtime_api.h>
+
 //#include "hip/hcc_detail/hip_hcc.h"
 //---
 // Remainder of this file only compiles with HCC
@@ -815,9 +816,6 @@ extern "C" __device__ void* __hip_hc_free(void *ptr);
 //extern "C" __device__ void* malloc(size_t size);
 //extern "C" __device__ void* free(void *ptr);
 
-extern "C" __device__ char4 __hip_hc_add8pk(char4, char4);
-extern "C" __device__ char4 __hip_hc_sub8pk(char4, char4);
-extern "C" __device__ char4 __hip_hc_mul8pk(char4, char4);
 
 #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
 
diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h
index 7c48985996..812bd272d0 100644
--- a/include/hip/hcc_detail/hip_vector_types.h
+++ b/include/hip/hcc_detail/hip_vector_types.h
@@ -32,402 +32,4051 @@ THE SOFTWARE.
 #error("This version of HIP requires a newer version of HCC.");
 #endif
 
-#if 0
-#include <hc_short_vector.hpp>
+#include "host_defines.h"
 
-using namespace hc::short_vector;
+#define MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(type) \
+__device__ __host__ type() {} \
+__device__ __host__ type(type& val) : x(val.x) { } \
+__device__ __host__ type(const type& val) : x(val.x) { }
+
+#define MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(type) \
+__device__ __host__ type() {} \
+__device__ __host__ type(type& val) : x(val.x), y(val.y) { } \
+__device__ __host__ type(const type& val) : x(val.x), y(val.y) { }
+
+#define MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(type) \
+__device__ __host__ type() {} \
+__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z) { } \
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z) { }
+
+#define MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(type) \
+__device__ __host__ type() {} \
+__device__ __host__ type(type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { } \
+__device__ __host__ type(const type& val) : x(val.x), y(val.y), z(val.z), w(val.w) { }
 
 
-//-- Signed
-// Define char vector types
-typedef hc::short_vector::char1 char1;
-typedef hc::short_vector::char2 char2;
-typedef hc::short_vector::char3 char3;
-typedef hc::short_vector::char4 char4;
+#define MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(type, type1) \
+__device__ __host__ type(type1 val) : x(val) {} \
 
-// Define short vector types
-typedef hc::short_vector::short1 short1;
-typedef hc::short_vector::short2 short2;
-typedef hc::short_vector::short3 short3;
-typedef hc::short_vector::short4 short4;
+#define MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(type, type1) \
+__device__ __host__ type(type1 val) : x(val), y(val) {} \
+__device__ __host__ type(type1 val1, type1 val2) : x(val1), y(val2) {}
 
-// Define int vector types
-typedef hc::short_vector::int1 int1;
-typedef hc::short_vector::int2 int2;
-typedef hc::short_vector::int3 int3;
-typedef hc::short_vector::int4 int4;
+#define MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(type, type1) \
+__device__ __host__ type(type1 val) : x(val), y(val), z(val) {} \
+__device__ __host__ type(type1 val1, type1 val2, type1 val3) : x(val1), y(val2), z(val3) {}
 
-// Define long vector types
-typedef hc::short_vector::long1 long1;
-typedef hc::short_vector::long2 long2;
-typedef hc::short_vector::long3 long3;
-typedef hc::short_vector::long4 long4;
+#define MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(type, type1) \
+__device__ __host__ type(type1 val) : x(val), y(val), z(val), w(val) {} \
+__device__ __host__ type(type1 val1, type1 val2, type1 val3, type1 val4) : x(val1), y(val2), z(val3), w(val4) {}
 
-// Define longlong vector types
-typedef hc::short_vector::longlong1 longlong1;
-typedef hc::short_vector::longlong2 longlong2;
-typedef hc::short_vector::longlong3 longlong3;
-typedef hc::short_vector::longlong4 longlong4;
+struct uchar1 {
+  #ifdef __cplusplus
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uchar1)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uchar1, signed long long)
 
+  #endif
+  unsigned char x;
 
-//-- Unsigned
-// Define uchar vector types
-typedef hc::short_vector::uchar1 uchar1;
-typedef hc::short_vector::uchar2 uchar2;
-typedef hc::short_vector::uchar3 uchar3;
-typedef hc::short_vector::uchar4 uchar4;
+} __attribute__((aligned(1)));
 
-// Define ushort vector types
-typedef hc::short_vector::ushort1 ushort1;
-typedef hc::short_vector::ushort2 ushort2;
-typedef hc::short_vector::ushort3 ushort3;
-typedef hc::short_vector::ushort4 ushort4;
+struct uchar2 {
+  #ifdef __cplusplus
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uchar2)
 
-// Define uint vector types
-typedef hc::short_vector::uint1 uint1;
-typedef hc::short_vector::uint2 uint2;
-typedef hc::short_vector::uint3 uint3;
-typedef hc::short_vector::uint4 uint4;
-
-// Define ulong vector types
-typedef hc::short_vector::ulong1 ulong1;
-typedef hc::short_vector::ulong2 ulong2;
-typedef hc::short_vector::ulong3 ulong3;
-typedef hc::short_vector::ulong4 ulong4;
-
-// Define ulonglong vector types
-typedef hc::short_vector::ulonglong1 ulonglong1;
-typedef hc::short_vector::ulonglong2 ulonglong2;
-typedef hc::short_vector::ulonglong3 ulonglong3;
-typedef hc::short_vector::ulonglong4 ulonglong4;
-
-
-//-- Floating point
-// Define float vector types
-typedef hc::short_vector::float1 float1;
-typedef hc::short_vector::float2 float2;
-typedef hc::short_vector::float3 float3;
-typedef hc::short_vector::float4 float4;
-
-// Define double vector types
-typedef hc::short_vector::double1 double1;
-typedef hc::short_vector::double2 double2;
-typedef hc::short_vector::double3 double3;
-typedef hc::short_vector::double4 double4;
-
-#else
-
-#define __hip_align(name, val, data) \
-    __attribute__((aligned(val))) name \
-    { data }
-
-struct __hip_align(char1, 1, signed char x;);
-struct __hip_align(uchar1, 1, unsigned char x;);
-
-struct __hip_align(char2, 2, signed char x; signed char y;);
-struct __hip_align(uchar2, 2, unsigned char x; unsigned char y;);
-
-struct char3
-{
-    signed char x, y, z;
-};
-
-struct uchar3
-{
-    unsigned char x, y, z;
-};
-
-struct char4
-{
-    union {
-        signed char x, y, z, w;
-        unsigned int val;
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uchar2, signed long long)
+  #endif
+  union {
+    struct {
+      unsigned char x, y;
     };
+    unsigned short a;
+  };
+} __attribute__((aligned(2)));
+
+struct uchar3 {  // Three unsigned char components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uchar3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uchar3, signed long long)
+  #endif
+  unsigned char x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct uchar4
-{
-    union {
-        unsigned char x, y, z, w;
-        unsigned int val;
+struct uchar4 {  // Four unsigned char components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uchar4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uchar4, signed long long)
+  #endif
+  union {  // The named components and the raw word `a` share the same storage.
+    struct {
+      unsigned char x, y, z, w;
     };
+    unsigned int a;  // Raw view of the components; assumes unsigned int is 4 bytes so it spans x..w (confirm per target).
+  };
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+
+struct char1 {  // Single signed char component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(char1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(char1, signed long long)
+  #endif
+  signed char x;  // Explicitly signed, independent of whether plain char is signed on the target.
+} __attribute__((aligned(1)));  // Requests a minimum alignment of 1 byte.
+
+struct char2 {  // Two signed char components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(char2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(char2, signed long long)
+  #endif
+  union {  // The named components and the raw word `a` share the same storage.
+    struct {
+      signed char x, y;
+    };
+    unsigned short a;  // Raw view of the components; assumes unsigned short is 2 bytes so it spans x and y (confirm per target).
+  };
+} __attribute__((aligned(2)));  // Requests at least 2-byte alignment for the struct.
+
+struct char3 {  // Three signed char components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(char3)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(char3, signed long long)
+  #endif
+  signed char x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-//struct __hip_align(char4, 4, signed char x; signed char y; signed char z; signed char w;);
-//struct __hip_align(uchar4, 4, unsigned char x; unsigned char y; unsigned char z; unsigned char w;);
+struct char4 {  // Four signed char components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(char4)
 
-struct __hip_align(short1, 2, signed short x;);
-struct __hip_align(ushort1, 2, unsigned short x;);
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(char4, signed long long)
+  #endif
+  union {  // The named components and the raw word `a` share the same storage.
+    struct {
+      signed char x, y, z, w;
+    };
+    unsigned int a;  // Raw view of the components; assumes unsigned int is 4 bytes so it spans x..w (confirm per target).
+  };
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
 
-struct __hip_align(short2, 4, signed short x; signed short y;);
-struct __hip_align(ushort2, 4, unsigned short x; unsigned short y;);
 
-struct short3
-{
-    signed short x, y, z;
+
+struct ushort1 {  // Single unsigned short component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ushort1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ushort1, signed long long)
+  #endif
+  unsigned short x;  // The only data member.
+} __attribute__((aligned(2)));  // Requests at least 2-byte alignment for the struct.
+
+struct ushort2 {  // Two unsigned short components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ushort2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ushort2, signed long long)
+  #endif
+  union {  // The named components and the raw word `a` share the same storage.
+    struct {
+      unsigned short x, y;
+    };
+    unsigned int a;  // Raw view of the components; assumes 2-byte short and 4-byte int so it spans x and y (confirm per target).
+  };
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+struct ushort3 {  // Three unsigned short components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ushort3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ushort3, signed long long)
+  #endif
+  unsigned short x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct ushort3
-{
-    unsigned short x, y, z;
+struct ushort4 {  // Four unsigned short components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ushort4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ushort4, signed long long)
+  #endif
+  union {  // The named components and the raw words a/b share the same storage.
+    struct {
+      unsigned short x, y, z, w;
+    };
+    struct { unsigned int a, b; };  // Grouped in a struct so b follows a (b was aliasing a at offset 0 as a bare union member).
+  };
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct short1 {  // Single signed short component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(short1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(short1, signed long long)
+  #endif
+  signed short x;  // The only data member.
+} __attribute__((aligned(2)));  // Requests at least 2-byte alignment for the struct.
+
+struct short2 {  // Two signed short components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(short2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(short2, signed long long)
+  #endif
+  union {  // The named components and the raw word `a` share the same storage.
+    struct {
+      signed short x, y;
+    };
+    unsigned int a;  // Raw view of the components; assumes 2-byte short and 4-byte int so it spans x and y (confirm per target).
+  };
+
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+struct short3 {  // Three signed short components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(short3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(short3, signed long long)
+  #endif
+  signed short x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct __hip_align(short4, 8, signed short x; signed short y; signed short z; signed short w;);
-struct __hip_align(ushort4, 8, unsigned short x; unsigned short y; unsigned short z; unsigned short w;);
+struct short4 {  // Four signed short components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(short4)
 
-struct __hip_align(int1, 4, signed int x;);
-struct __hip_align(uint1, 4, unsigned int x;);
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(short4, signed long long)
+  #endif
+  union {  // The named components and the raw words a/b share the same storage.
+    struct {
+      signed short x, y, z, w;
+    };
+    struct { unsigned int a, b; };  // Grouped in a struct so b follows a (b was aliasing a at offset 0 as a bare union member).
+  };
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
 
-struct __hip_align(int2, 8, signed int x; signed int y;);
-struct __hip_align(uint2, 8, unsigned int x; unsigned int y;);
 
-struct int3{
-    signed int x, y, z;
-};
-struct uint3{
-    unsigned int x, y, z;
+struct uint1 {  // Single unsigned int component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(uint1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(uint1, signed long long)
+  #endif
+  unsigned int x;  // The only data member.
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+struct uint2 {  // Two unsigned int components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(uint2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(uint2, signed long long)
+  #endif
+  unsigned int x, y;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct uint3 {  // Three unsigned int components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(uint3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(uint3, signed long long)
+  #endif
+  unsigned int x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct __hip_align(int4, 16, signed int x; signed int y; signed int z; signed int w;);
-struct __hip_align(uint4, 16, unsigned int x; unsigned int y; unsigned int z; unsigned int w;);
+struct uint4 {  // Four unsigned int components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(uint4)
 
-struct __hip_align(long1, 8, long int x;);
-struct __hip_align(ulong1, 8, unsigned long x;);
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(uint4, signed long long)
+  #endif
+  unsigned int x, y, z, w;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(16)));  // Requests at least 16-byte alignment for the struct.
 
-struct __hip_align(long2, 16, long int x; long int y;);
-struct __hip_align(ulong2, 16, unsigned long x; unsigned long y;);
+struct int1 {  // Single signed int component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(int1)
 
-struct long3{
-    long int x, y, z;
-};
-struct ulong3{
-    unsigned long x, y, z;
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(int1, signed long long)
+  #endif
+  signed int x;  // The only data member.
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+struct int2 {  // Two signed int components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(int2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(int2, signed long long)
+  #endif
+  signed int x, y;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct int3 {  // Three signed int components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(int3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(int3, signed long long)
+  #endif
+  signed int x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct __hip_align(long4, 32, long int x; long int y; long int z; long int w;);
-struct __hip_align(ulong4, 32, unsigned long x; unsigned long y; unsigned long z; unsigned long w;);
+struct int4 {  // Four signed int components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(int4)
 
-struct float1
-{
-    float x;
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(int4, signed long long)
+  #endif
+  signed int x, y, z, w;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(16)));  // Requests at least 16-byte alignment for the struct.
+
+
+struct float1 {  // Single float component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(float1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(float1, signed long long)
+  #endif
+  float x;  // The only data member.
+} __attribute__((aligned(4)));  // Requests at least 4-byte alignment for the struct.
+
+struct float2 {  // Two float components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(float2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(float2, signed long long)
+  #endif
+  float x, y;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct float3 {  // Three float components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(float3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(float3, signed long long)
+  #endif
+  float x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct __hip_align(float2, 8, float x; float y;);
+struct float4 {  // Four float components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(float4)
 
-struct float3
-{
-    float x, y, z;
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(float4, signed long long)
+  #endif
+  float x, y, z, w;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(16)));  // Requests at least 16-byte alignment for the struct.
+
+
+
+struct double1 {  // Single double component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(double1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(double1, signed long long)
+  #endif
+  double x;  // The only data member.
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct double2 {  // Two double components: x, y.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(double2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(double2, signed long long)
+  #endif
+  double x, y;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(16)));  // Requests at least 16-byte alignment for the struct.
+
+struct double3 {  // Three double components: x, y, z.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(double3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(double3, signed long long)
+  #endif
+  double x, y, z;  // Plain members; no aligned attribute is applied to this struct.
 };
 
-struct __hip_align(float4, 16, float x; float y; float z; float w;);
+struct double4 {  // Four double components: x, y, z, w.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(double4)
 
-struct __hip_align(longlong1, 16, long long int x;);
-struct __hip_align(ulonglong1, 16, unsigned long long int x;);
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(double4, signed long long)
+  #endif
+  double x, y, z, w;  // Plain members; this type has no raw-word union view.
+} __attribute__((aligned(32)));  // Requests at least 32-byte alignment for the struct.
 
-struct __attribute__((aligned(32))) longlong2
-{
-    long long int x, y;
+
+struct ulong1 {  // Single unsigned long component: x.
+  #ifdef __cplusplus  // C++ only. The MAKE_* macros are defined outside this view; presumably they emit constructors.
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulong1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulong1, signed long long)
+  #endif
+  unsigned long x;  // The only data member; the width of unsigned long depends on the target's data model.
+} __attribute__((aligned(8)));  // Requests at least 8-byte alignment for the struct.
+
+struct ulong2 {  // Two-component vector of unsigned long (x, y).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulong2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulong2, signed long long)
+  #endif
+  unsigned long x, y;
+} __attribute__((aligned(16)));  // 16-byte alignment requested.
+
+struct ulong3 {  // Three-component vector of unsigned long (x, y, z); no explicit alignment attribute.
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulong3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulong3, signed long long)
+  #endif
+  unsigned long x, y, z;
 };
 
-struct __attribute__((aligned(32))) ulonglong2
-{
-    unsigned long long int x, y;
+struct ulong4 {  // Four-component vector of unsigned long (x, y, z, w).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulong4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulong4, signed long long)
+  #endif
+  unsigned long x, y, z, w;
+} __attribute__((aligned(32)));  // 32-byte alignment requested.
+
+
+struct long1 {  // One-component vector of signed long (x).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(long1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(long1, signed long long)
+  #endif
+  signed long x;
+} __attribute__((aligned(8)));  // 8-byte alignment requested.
+
+struct long2 {  // Two-component vector of signed long (x, y).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(long2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(long2, signed long long)
+  #endif
+  signed long x, y;
+} __attribute__((aligned(16)));  // 16-byte alignment requested.
+
+struct long3 {  // Three-component vector of signed long (x, y, z); no explicit alignment attribute.
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(long3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(long3, signed long long)
+  #endif
+  signed long x, y, z;
 };
 
-struct longlong3
-{
-    long long int x, y, z;
+struct long4 {  // Four-component vector of signed long (x, y, z, w).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(long4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(long4, signed long long)
+  #endif
+  signed long x, y, z, w;
+} __attribute__((aligned(32)));  // 32-byte alignment requested.
+
+
+struct ulonglong1 {  // One-component vector of unsigned long long (x).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(ulonglong1, signed long long)
+  #endif
+  unsigned long long x;
+} __attribute__((aligned(8)));  // 8-byte alignment requested.
+
+struct ulonglong2 {  // Two-component vector of unsigned long long (x, y).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(ulonglong2, signed long long)
+  #endif
+  unsigned long long x, y;
+} __attribute__((aligned(16)));  // 16-byte alignment requested.
+
+struct ulonglong3 {  // Three-component vector of unsigned long long (x, y, z); no explicit alignment attribute.
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(ulonglong3, signed long long)
+  #endif
+  unsigned long long x, y, z;
 };
 
-struct ulonglong3
-{
-    unsigned long long int x, y, z;
+struct ulonglong4 {  // Four-component vector of unsigned long long (x, y, z, w).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4)
+
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(ulonglong4, signed long long)
+  #endif
+  unsigned long long x, y, z, w;
+} __attribute__((aligned(32)));  // 32-byte alignment requested.
+
+
+struct longlong1 {  // One-component vector of signed long long (x).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_ONE_COMPONENT(longlong1)
+
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, float)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, double)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_ONE_COMPONENT(longlong1, signed long long)
+  #endif
+  signed long long x;
+} __attribute__((aligned(8)));  // 8-byte alignment requested.
+
+struct longlong2 {  // Two-component vector of signed long long (x, y).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_TWO_COMPONENT(longlong2)
+
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, float)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, double)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_TWO_COMPONENT(longlong2, signed long long)
+  #endif
+  signed long long x, y;
+} __attribute__((aligned(16)));  // 16-byte alignment requested.
+
+struct longlong3 {  // Three-component vector of signed long long (x, y, z); no explicit alignment attribute.
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_THREE_COMPONENT(longlong3)
+
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, float)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, double)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_THREE_COMPONENT(longlong3, signed long long)
+  #endif
+  signed long long x, y, z;
 };
 
-struct __attribute__((aligned(64))) longlong4
-{
-    long long int x, y, z, w;
-};
+struct longlong4 {  // Four-component vector of signed long long (x, y, z, w).
+  #ifdef __cplusplus  // C++ only; the MAKE_* macros are defined outside this section (presumably constructors).
+    public:
+    MAKE_DEFAULT_CONSTRUCTOR_FOUR_COMPONENT(longlong4)
 
-struct __attribute__((aligned(64))) ulonglong4
-{
-    unsigned long long int x, y, z, w;
-};
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed char)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed short)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed int)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, float)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, double)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, unsigned long long)
+    MAKE_COMPONENT_CONSTRUCTOR_FOUR_COMPONENT(longlong4, signed long long)
+  #endif
+  signed long long x, y, z, w;  // long long, not long: long is narrower on LLP64 targets.
+} __attribute__((aligned(32)));  // 32-byte alignment requested.
 
-struct double1
-{
-    double x;
-};
+#define DECLOP_MAKE_ONE_COMPONENT(comp, type) /* defines make_<type>(x): default-constructs a <type>, then assigns x */ \
+__device__ __host__ inline type make_##type(comp x) { \
+  type result; \
+  result.x = x; \
+  return result; \
+}
 
-struct __attribute__((aligned(16))) double2
-{
-    double x, y;
-};
+#define DECLOP_MAKE_TWO_COMPONENT(comp, type) /* defines make_<type>(x, y): default-constructs a <type>, then assigns x and y */ \
+__device__ __host__ inline type make_##type(comp x, comp y) { \
+  type result; \
+  result.x = x; \
+  result.y = y; \
+  return result; \
+}
 
-struct double3
-{
-    double x, y, z;
-};
+#define DECLOP_MAKE_THREE_COMPONENT(comp, type) /* defines make_<type>(x, y, z): default-constructs a <type>, then assigns x, y, z */ \
+__device__ __host__ inline type make_##type(comp x, comp y, comp z) { \
+  type result; \
+  result.x = x; \
+  result.y = y; \
+  result.z = z; \
+  return result; \
+}
 
-struct __attribute__((aligned(32))) double4
-{
-    double x, y, z, w;
-};
+#define DECLOP_MAKE_FOUR_COMPONENT(comp, type) /* defines make_<type>(x, y, z, w): default-constructs a <type>, then assigns all four */ \
+__device__ __host__ inline type make_##type(comp x, comp y, comp z, comp w) { \
+  type result; \
+  result.x = x; \
+  result.y = y; \
+  result.z = z; \
+  result.w = w; \
+  return result; \
+}
 
-#endif
+DECLOP_MAKE_ONE_COMPONENT(unsigned char, uchar1);  // make_uchar1..make_uchar4: factories taking unsigned char components.
+DECLOP_MAKE_TWO_COMPONENT(unsigned char, uchar2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned char, uchar3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned char, uchar4);
 
-#if __HCC__
-#include"hip/hcc_detail/host_defines.h"
-#define __HIP_DEVICE__ __device__ __host__
-#else
-#define __HIP_DEVICE__
-#endif
+DECLOP_MAKE_ONE_COMPONENT(signed char, char1);  // make_char1..make_char4: factories taking signed char components.
+DECLOP_MAKE_TWO_COMPONENT(signed char, char2);
+DECLOP_MAKE_THREE_COMPONENT(signed char, char3);
+DECLOP_MAKE_FOUR_COMPONENT(signed char, char4);
 
-__HIP_DEVICE__ char1 make_char1(signed char );
-__HIP_DEVICE__ char2 make_char2(signed char, signed char );
-__HIP_DEVICE__ char3 make_char3(signed char, signed char, signed char );
-__HIP_DEVICE__ char4 make_char4(signed char, signed char, signed char, signed char );
+DECLOP_MAKE_ONE_COMPONENT(unsigned short, ushort1);  // make_ushort1..make_ushort4: factories taking unsigned short components.
+DECLOP_MAKE_TWO_COMPONENT(unsigned short, ushort2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned short, ushort3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned short, ushort4);
 
-__HIP_DEVICE__ short1 make_short1(short );
-__HIP_DEVICE__ short2 make_short2(short, short );
-__HIP_DEVICE__ short3 make_short3(short, short, short );
-__HIP_DEVICE__ short4 make_short4(short, short, short, short );
+DECLOP_MAKE_ONE_COMPONENT(signed short, short1);  // make_short1..make_short4: factories taking signed short components.
+DECLOP_MAKE_TWO_COMPONENT(signed short, short2);
+DECLOP_MAKE_THREE_COMPONENT(signed short, short3);
+DECLOP_MAKE_FOUR_COMPONENT(signed short, short4);
 
-__HIP_DEVICE__ int1 make_int1(int );
-__HIP_DEVICE__ int2 make_int2(int, int );
-__HIP_DEVICE__ int3 make_int3(int, int, int );
-__HIP_DEVICE__ int4 make_int4(int, int, int, int );
+DECLOP_MAKE_ONE_COMPONENT(unsigned int, uint1);  // make_uint1..make_uint4: factories taking unsigned int components.
+DECLOP_MAKE_TWO_COMPONENT(unsigned int, uint2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned int, uint3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned int, uint4);
 
-__HIP_DEVICE__ long1 make_long1(long );
-__HIP_DEVICE__ long2 make_long2(long, long );
-__HIP_DEVICE__ long3 make_long3(long, long, long );
-__HIP_DEVICE__ long4 make_long4(long, long, long, long );
+DECLOP_MAKE_ONE_COMPONENT(signed int, int1);  // make_int1..make_int4: factories taking signed int components.
+DECLOP_MAKE_TWO_COMPONENT(signed int, int2);
+DECLOP_MAKE_THREE_COMPONENT(signed int, int3);
+DECLOP_MAKE_FOUR_COMPONENT(signed int, int4);
 
-__HIP_DEVICE__ longlong1 make_longlong1(long long );
-__HIP_DEVICE__ longlong2 make_longlong2(long long, long long );
-__HIP_DEVICE__ longlong3 make_longlong3(long long, long long, long long );
-__HIP_DEVICE__ longlong4 make_longlong4(long long, long long, long long, long long );
+DECLOP_MAKE_ONE_COMPONENT(float, float1);  // make_float1..make_float4: factories taking float components.
+DECLOP_MAKE_TWO_COMPONENT(float, float2);
+DECLOP_MAKE_THREE_COMPONENT(float, float3);
+DECLOP_MAKE_FOUR_COMPONENT(float, float4);
 
-__HIP_DEVICE__ uchar1 make_uchar1(unsigned char );
-__HIP_DEVICE__ uchar2 make_uchar2(unsigned char, unsigned char );
-__HIP_DEVICE__ uchar3 make_uchar3(unsigned char, unsigned char, unsigned char );
-__HIP_DEVICE__ uchar4 make_uchar4(unsigned char, unsigned char, unsigned char, unsigned char );
+DECLOP_MAKE_ONE_COMPONENT(double, double1);  // make_double1..make_double4: factories taking double components.
+DECLOP_MAKE_TWO_COMPONENT(double, double2);
+DECLOP_MAKE_THREE_COMPONENT(double, double3);
+DECLOP_MAKE_FOUR_COMPONENT(double, double4);
 
-__HIP_DEVICE__ ushort1 make_ushort1(unsigned short );
-__HIP_DEVICE__ ushort2 make_ushort2(unsigned short, unsigned short );
-__HIP_DEVICE__ ushort3 make_ushort3(unsigned short, unsigned short, unsigned short );
-__HIP_DEVICE__ ushort4 make_ushort4(unsigned short, unsigned short, unsigned short, unsigned short );
+DECLOP_MAKE_ONE_COMPONENT(unsigned long, ulong1);  // make_ulong1..make_ulong4: factories taking unsigned long components.
+DECLOP_MAKE_TWO_COMPONENT(unsigned long, ulong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long, ulong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long, ulong4);
 
-__HIP_DEVICE__ uint1 make_uint1(unsigned int );
-__HIP_DEVICE__ uint2 make_uint2(unsigned int, unsigned int );
-__HIP_DEVICE__ uint3 make_uint3(unsigned int, unsigned int, unsigned int );
-__HIP_DEVICE__ uint4 make_uint4(unsigned int, unsigned int, unsigned int, unsigned int );
+DECLOP_MAKE_ONE_COMPONENT(signed long, long1);  // make_long1..make_long4: factories taking signed long components.
+DECLOP_MAKE_TWO_COMPONENT(signed long, long2);
+DECLOP_MAKE_THREE_COMPONENT(signed long, long3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long, long4);
 
-__HIP_DEVICE__ ulong1 make_ulong1(unsigned long );
-__HIP_DEVICE__ ulong2 make_ulong2(unsigned long, unsigned long );
-__HIP_DEVICE__ ulong3 make_ulong3(unsigned long, unsigned long, unsigned long );
-__HIP_DEVICE__ ulong4 make_ulong4(unsigned long, unsigned long, unsigned long, unsigned long );
+DECLOP_MAKE_ONE_COMPONENT(unsigned long long, ulonglong1);  // make_ulonglong1..4 take unsigned long long, matching the struct members.
+DECLOP_MAKE_TWO_COMPONENT(unsigned long long, ulonglong2);
+DECLOP_MAKE_THREE_COMPONENT(unsigned long long, ulonglong3);
+DECLOP_MAKE_FOUR_COMPONENT(unsigned long long, ulonglong4);
 
-__HIP_DEVICE__ ulonglong1 make_ulonglong1(unsigned long long );
-__HIP_DEVICE__ ulonglong2 make_ulonglong2(unsigned long long, unsigned long long);
-__HIP_DEVICE__ ulonglong3 make_ulonglong3(unsigned long long, unsigned long long, unsigned long long);
-__HIP_DEVICE__ ulonglong4 make_ulonglong4(unsigned long long, unsigned long long, unsigned long long, unsigned long long );
+DECLOP_MAKE_ONE_COMPONENT(signed long long, longlong1);  // make_longlong1..4 take signed long long so 64-bit arguments are not narrowed where long is 32-bit.
+DECLOP_MAKE_TWO_COMPONENT(signed long long, longlong2);
+DECLOP_MAKE_THREE_COMPONENT(signed long long, longlong3);
+DECLOP_MAKE_FOUR_COMPONENT(signed long long, longlong4);
 
-__HIP_DEVICE__ float1 make_float1(float );
-__HIP_DEVICE__ float2 make_float2(float, float );
-__HIP_DEVICE__ float3 make_float3(float, float, float );
-__HIP_DEVICE__ float4 make_float4(float, float, float, float );
 
-__HIP_DEVICE__ double1 make_double1(double );
-__HIP_DEVICE__ double2 make_double2(double, double );
-__HIP_DEVICE__ double3 make_double3(double, double, double );
-__HIP_DEVICE__ double4 make_double4(double, double, double, double );
+#if __cplusplus
+
+#define DECLOP_1VAR_2IN_1OUT(type, op) /* binary op on the x components; inline so the header may be included in several translation units */ \
+__device__ __host__ inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_SCALE_PRODUCT(type, type1) /* vector*scalar and scalar*vector; inline to avoid multiple definitions across translation units */ \
+__device__ __host__ inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_ASSIGN(type, op) /* compound assignment: applies op to x and returns the left operand by reference */ \
+__device__ __host__ inline type& operator op (type& dst, const type& src) { \
+  dst.x op src.x; \
+  return dst; \
+}
+
+#define DECLOP_1VAR_PREOP(type, op) /* prefix form: updates x in place and returns the operand by reference */ \
+__device__ __host__ inline type& operator op (type& operand) { \
+  op operand.x; \
+  return operand; \
+}
+
+#define DECLOP_1VAR_POSTOP(type, op) /* postfix form: returns the value held before the update; the int tag parameter is unnamed because it is unused */ \
+__device__ __host__ inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  val.x op; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_COMP(type, op) /* applies op to the x components; const references accept const and temporary operands */ \
+__device__ __host__ inline bool operator op (const type& lhs, const type& rhs) { \
+  return lhs.x op rhs.x; \
+}
+
+#define DECLOP_1VAR_1IN_1OUT(type, op) /* unary op on x, result by value; inline for header use, const reference because the operand is not modified */ \
+__device__ __host__ inline type operator op(const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  return ret; \
+}
+
+#define DECLOP_1VAR_1IN_BOOLOUT(type, op) /* unary op on x yielding bool; const reference because the operand is only read */ \
+__device__ __host__ inline bool operator op (const type& rhs) { \
+  return op rhs.x; \
+}
+
+/*
+ Two Element Access
+*/
+
+#define DECLOP_2VAR_2IN_1OUT(type, op) /* component-wise binary op on x and y; inline so the header may be included in several translation units */ \
+__device__ __host__ inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_SCALE_PRODUCT(type, type1) /* vector*scalar and scalar*vector on x and y; inline to avoid multiple definitions */ \
+__device__ __host__ inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_ASSIGN(type, op) /* compound assignment: applies op to x and y, returns the left operand by reference */ \
+__device__ __host__ inline type& operator op (type& dst, const type& src) { \
+  dst.x op src.x; \
+  dst.y op src.y; \
+  return dst; \
+}
+
+#define DECLOP_2VAR_PREOP(type, op) /* prefix form: updates x and y in place and returns the operand by reference */ \
+__device__ __host__ inline type& operator op (type& operand) { \
+  op operand.x; \
+  op operand.y; \
+  return operand; \
+}
+
+#define DECLOP_2VAR_POSTOP(type, op) /* postfix form: returns the values held before the update; the int tag parameter is unnamed because it is unused */ \
+__device__ __host__ inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  ret.y = val.y; \
+  val.x op; \
+  val.y op; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_COMP(type, op) /* true only when op holds for every component (this also applies to !=); terms are parenthesised so op = || is not regrouped by && precedence */ \
+__device__ __host__ inline bool operator op (const type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y); \
+}
+
+#define DECLOP_2VAR_1IN_1OUT(type, op) /* unary op on x and y, result by value; inline for header use, const reference because the operand is not modified */ \
+__device__ __host__ inline type operator op(const type &rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  ret.y = op rhs.y; \
+  return ret; \
+}
+
+#define DECLOP_2VAR_1IN_BOOLOUT(type, op) /* true when the unary op yields true for both x and y; const reference because the operand is only read */ \
+__device__ __host__ inline bool operator op (const type &rhs) { \
+  return op rhs.x && op rhs.y; \
+}
 
 
 /*
-///---
-// Inline functions for creating vector types from basic types
-#define ONE_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT [[hc]] [[cpu]] (T x) { VT t; t.x = x; return t; };
-#define TWO_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT [[hc]] [[cpu]] (T x, T y) { VT t; t.x=x; t.y=y; return t; };
-#define THREE_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT [[hc]] [[cpu]] (T x, T y, T z) { VT t; t.x=x; t.y=y; t.z=z; return t; };
-#define FOUR_COMPONENT_ACCESS(T, VT) inline VT make_ ##VT [[hc]] [[cpu]] (T x, T y, T z, T w) { VT t; t.x=x; t.y=y; t.z=z; t.w=w; return t; };
-
-
-//signed:
-ONE_COMPONENT_ACCESS  (signed char, char1);
-TWO_COMPONENT_ACCESS  (signed char, char2);
-THREE_COMPONENT_ACCESS(signed char, char3);
-FOUR_COMPONENT_ACCESS (signed char, char4);
-
-ONE_COMPONENT_ACCESS  (short, short1);
-TWO_COMPONENT_ACCESS  (short, short2);
-THREE_COMPONENT_ACCESS(short, short3);
-FOUR_COMPONENT_ACCESS (short, short4);
-
-ONE_COMPONENT_ACCESS  (int, int1);
-TWO_COMPONENT_ACCESS  (int, int2);
-THREE_COMPONENT_ACCESS(int, int3);
-FOUR_COMPONENT_ACCESS (int, int4);
-
-ONE_COMPONENT_ACCESS  (long int, long1);
-TWO_COMPONENT_ACCESS  (long int, long2);
-THREE_COMPONENT_ACCESS(long int, long3);
-FOUR_COMPONENT_ACCESS (long int, long4);
-
-ONE_COMPONENT_ACCESS  (long long int, ulong1);
-TWO_COMPONENT_ACCESS  (long long int, ulong2);
-THREE_COMPONENT_ACCESS(long long int, ulong3);
-FOUR_COMPONENT_ACCESS (long long int, ulong4);
-
-ONE_COMPONENT_ACCESS  (long long int, longlong1);
-TWO_COMPONENT_ACCESS  (long long int, longlong2);
-THREE_COMPONENT_ACCESS(long long int, longlong3);
-FOUR_COMPONENT_ACCESS (long long int, longlong4);
-
-
-// unsigned:
-ONE_COMPONENT_ACCESS  (unsigned char, uchar1);
-TWO_COMPONENT_ACCESS  (unsigned char, uchar2);
-THREE_COMPONENT_ACCESS(unsigned char, uchar3);
-FOUR_COMPONENT_ACCESS (unsigned char, uchar4);
-
-ONE_COMPONENT_ACCESS  (unsigned short, ushort1);
-TWO_COMPONENT_ACCESS  (unsigned short, ushort2);
-THREE_COMPONENT_ACCESS(unsigned short, ushort3);
-FOUR_COMPONENT_ACCESS (unsigned short, ushort4);
-
-ONE_COMPONENT_ACCESS  (unsigned int, uint1);
-TWO_COMPONENT_ACCESS  (unsigned int, uint2);
-THREE_COMPONENT_ACCESS(unsigned int, uint3);
-FOUR_COMPONENT_ACCESS (unsigned int, uint4);
-
-ONE_COMPONENT_ACCESS  (unsigned long int, ulong1);
-TWO_COMPONENT_ACCESS  (unsigned long int, ulong2);
-THREE_COMPONENT_ACCESS(unsigned long int, ulong3);
-FOUR_COMPONENT_ACCESS (unsigned long int, ulong4);
-
-ONE_COMPONENT_ACCESS  (unsigned long long int, ulong1);
-TWO_COMPONENT_ACCESS  (unsigned long long int, ulong2);
-THREE_COMPONENT_ACCESS(unsigned long long int, ulong3);
-FOUR_COMPONENT_ACCESS (unsigned long long int, ulong4);
-
-ONE_COMPONENT_ACCESS  (unsigned long long int, ulonglong1);
-TWO_COMPONENT_ACCESS  (unsigned long long int, ulonglong2);
-THREE_COMPONENT_ACCESS(unsigned long long int, ulonglong3);
-FOUR_COMPONENT_ACCESS (unsigned long long int, ulonglong4);
-
-
-//Floating point
-ONE_COMPONENT_ACCESS  (float, float1);
-TWO_COMPONENT_ACCESS  (float, float2);
-THREE_COMPONENT_ACCESS(float, float3);
-FOUR_COMPONENT_ACCESS (float, float4);
-
-ONE_COMPONENT_ACCESS  (double, double1);
-TWO_COMPONENT_ACCESS  (double, double2);
-THREE_COMPONENT_ACCESS(double, double3);
-FOUR_COMPONENT_ACCESS (double, double4);
+ Three Element Access
 */
 
+#define DECLOP_3VAR_2IN_1OUT(type, op) /* component-wise binary op on x, y, z; inline so the header may be included in several translation units */ \
+__device__ __host__ inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  ret.z = lhs.z op rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_SCALE_PRODUCT(type, type1) /* vector*scalar and scalar*vector on x, y, z; inline to avoid multiple definitions */ \
+__device__ __host__ inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  ret.z = lhs.z * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  ret.z = lhs * rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_ASSIGN(type, op) /* compound assignment: applies op to x, y, z and returns the left operand by reference */ \
+__device__ __host__ inline type& operator op (type& dst, const type& src) { \
+  dst.x op src.x; \
+  dst.y op src.y; \
+  dst.z op src.z; \
+  return dst; \
+}
+
+#define DECLOP_3VAR_PREOP(type, op) /* prefix form: updates x, y, z in place and returns the operand by reference */ \
+__device__ __host__ inline type& operator op (type& operand) { \
+  op operand.x; \
+  op operand.y; \
+  op operand.z; \
+  return operand; \
+}
+
+#define DECLOP_3VAR_POSTOP(type, op) /* postfix form: returns the values held before the update; the int tag parameter is unnamed because it is unused */ \
+__device__ __host__ inline type operator op (type& val, int) { \
+  type ret; \
+  ret.x = val.x; \
+  ret.y = val.y; \
+  ret.z = val.z; \
+  val.x op; \
+  val.y op; \
+  val.z op; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_COMP(type, op) /* true only when op holds for every component (this also applies to !=); terms are parenthesised so op = || is not regrouped by && precedence */ \
+__device__ __host__ inline bool operator op (const type& lhs, const type& rhs) { \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z); \
+}
+
+#define DECLOP_3VAR_1IN_1OUT(type, op) /* unary op on x, y, z, result by value; inline for header use, const reference because the operand is not modified */ \
+__device__ __host__ inline type operator op(const type &rhs) { \
+  type ret; \
+  ret.x = op rhs.x; \
+  ret.y = op rhs.y; \
+  ret.z = op rhs.z; \
+  return ret; \
+}
+
+#define DECLOP_3VAR_1IN_BOOLOUT(type, op) /* true when the unary op yields true for x, y and z; const reference because the operand is only read */ \
+__device__ __host__ inline bool operator op (const type &rhs) { \
+  return op rhs.x && op rhs.y && op rhs.z; \
+}
+
+
+/*
+ Four Element Access
+*/
+
+#define DECLOP_4VAR_2IN_1OUT(type, op) /* component-wise binary op on x, y, z, w; inline so the header may be included in several translation units */ \
+__device__ __host__ inline type operator op (const type& lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs.x op rhs.x; \
+  ret.y = lhs.y op rhs.y; \
+  ret.z = lhs.z op rhs.z; \
+  ret.w = lhs.w op rhs.w; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_SCALE_PRODUCT(type, type1) /* vector*scalar and scalar*vector on x, y, z, w; inline to avoid multiple definitions */ \
+__device__ __host__ inline type operator * (const type& lhs, type1 rhs) { \
+  type ret; \
+  ret.x = lhs.x * rhs; \
+  ret.y = lhs.y * rhs; \
+  ret.z = lhs.z * rhs; \
+  ret.w = lhs.w * rhs; \
+  return ret; \
+} \
+\
+__device__ __host__ inline type operator * (type1 lhs, const type& rhs) { \
+  type ret; \
+  ret.x = lhs * rhs.x; \
+  ret.y = lhs * rhs.y; \
+  ret.z = lhs * rhs.z; \
+  ret.w = lhs * rhs.w; \
+  return ret; \
+}
+
+#define DECLOP_4VAR_ASSIGN(type, op) /* compound assignment: applies op to x, y, z, w and returns the left operand by reference */ \
+__device__ __host__ inline type& operator op (type& dst, const type& src) { \
+  dst.x op src.x; \
+  dst.y op src.y; \
+  dst.z op src.z; \
+  dst.w op src.w; \
+  return dst; \
+}
+
+#define DECLOP_4VAR_PREOP(type, op) /* prefix form: updates x, y, z, w in place and returns the operand by reference */ \
+__device__ __host__ inline type& operator op (type& operand) { \
+  op operand.x; \
+  op operand.y; \
+  op operand.z; \
+  op operand.w; \
+  return operand; \
+}
+
+// Postfix ++/-- for 4-lane vectors: steps x, y, z and w and returns a copy holding the
+// values from before the step. Declared `inline` like DECLOP_4VAR_ASSIGN/PREOP/COMP so the
+// expansion can appear in more than one translation unit without duplicate definitions;
+// the dummy `int` that marks the postfix form is unnamed because it is never read.
+#define DECLOP_4VAR_POSTOP(type, op) \
+__device__ __host__ inline type operator op (type& val, int) { \
+  type ret = val; \
+  val.x op; \
+  val.y op; \
+  val.z op; \
+  val.w op; \
+  return ret; \
+}
+
+// True only if `lhs.k op rhs.k` holds in every lane (x, y, z, w); per-lane parentheses keep `||` grouped per lane.
+#define DECLOP_4VAR_COMP(type, op) \
+__device__ __host__ inline bool operator op (const type& lhs, const type& rhs) { /* const&: const/temporary operands */ \
+  return (lhs.x op rhs.x) && (lhs.y op rhs.y) && (lhs.z op rhs.z) && (lhs.w op rhs.w); }
+
+// Lane-wise unary operator for 4-lane vectors (instantiated with `~` below). Takes a const
+// reference so const operands and temporaries are accepted; `inline` permits the expansion
+// in more than one translation unit.
+#define DECLOP_4VAR_1IN_1OUT(type, op) \
+__device__ __host__ inline type operator op (const type& rhs) { \
+  type ret; \
+  ret.x = op rhs.x; ret.y = op rhs.y; ret.z = op rhs.z; ret.w = op rhs.w; \
+  return ret; \
+}
+
+// All-lanes unary predicate (instantiated with `!` below); const reference admits const and temporary operands.
+#define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
+__device__ __host__ inline bool operator op (const type& rhs) { \
+  return (op rhs.x) && (op rhs.y) && (op rhs.z) && (op rhs.w); }
+
+
+/*
+ Operator overloads: the macros above instantiated once per vector type
+*/
+
+// UNSIGNED CHAR1 -- operator overloads for uchar1, one DECLOP_1VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_1VAR_2IN_1OUT(uchar1, +)
+DECLOP_1VAR_2IN_1OUT(uchar1, -)
+DECLOP_1VAR_2IN_1OUT(uchar1, *)
+DECLOP_1VAR_2IN_1OUT(uchar1, /)
+DECLOP_1VAR_2IN_1OUT(uchar1, %)
+DECLOP_1VAR_2IN_1OUT(uchar1, &)
+DECLOP_1VAR_2IN_1OUT(uchar1, |)
+DECLOP_1VAR_2IN_1OUT(uchar1, ^)
+DECLOP_1VAR_2IN_1OUT(uchar1, <<)
+DECLOP_1VAR_2IN_1OUT(uchar1, >>)
+
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_1VAR_ASSIGN(uchar1, +=)
+DECLOP_1VAR_ASSIGN(uchar1, -=)
+DECLOP_1VAR_ASSIGN(uchar1, *=)
+DECLOP_1VAR_ASSIGN(uchar1, /=)
+DECLOP_1VAR_ASSIGN(uchar1, %=)
+DECLOP_1VAR_ASSIGN(uchar1, &=)
+DECLOP_1VAR_ASSIGN(uchar1, |=)
+DECLOP_1VAR_ASSIGN(uchar1, ^=)
+DECLOP_1VAR_ASSIGN(uchar1, <<=)
+DECLOP_1VAR_ASSIGN(uchar1, >>=)
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(uchar1, ++)
+DECLOP_1VAR_PREOP(uchar1, --)
+// Postfix increment and decrement.
+DECLOP_1VAR_POSTOP(uchar1, ++)
+DECLOP_1VAR_POSTOP(uchar1, --)
+// Relational operators.
+DECLOP_1VAR_COMP(uchar1, ==)
+DECLOP_1VAR_COMP(uchar1, !=)
+DECLOP_1VAR_COMP(uchar1, <)
+DECLOP_1VAR_COMP(uchar1, >)
+DECLOP_1VAR_COMP(uchar1, <=)
+DECLOP_1VAR_COMP(uchar1, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_1VAR_COMP(uchar1, &&)
+DECLOP_1VAR_COMP(uchar1, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_1VAR_1IN_1OUT(uchar1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(uchar1, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, float)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, double)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(uchar1, signed long long)
+
+// UNSIGNED CHAR2 -- operator overloads for uchar2, one DECLOP_2VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_2VAR_2IN_1OUT(uchar2, +)
+DECLOP_2VAR_2IN_1OUT(uchar2, -)
+DECLOP_2VAR_2IN_1OUT(uchar2, *)
+DECLOP_2VAR_2IN_1OUT(uchar2, /)
+DECLOP_2VAR_2IN_1OUT(uchar2, %)
+DECLOP_2VAR_2IN_1OUT(uchar2, &)
+DECLOP_2VAR_2IN_1OUT(uchar2, |)
+DECLOP_2VAR_2IN_1OUT(uchar2, ^)
+DECLOP_2VAR_2IN_1OUT(uchar2, <<)
+DECLOP_2VAR_2IN_1OUT(uchar2, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_2VAR_ASSIGN(uchar2, +=)
+DECLOP_2VAR_ASSIGN(uchar2, -=)
+DECLOP_2VAR_ASSIGN(uchar2, *=)
+DECLOP_2VAR_ASSIGN(uchar2, /=)
+DECLOP_2VAR_ASSIGN(uchar2, %=)
+DECLOP_2VAR_ASSIGN(uchar2, &=)
+DECLOP_2VAR_ASSIGN(uchar2, |=)
+DECLOP_2VAR_ASSIGN(uchar2, ^=)
+DECLOP_2VAR_ASSIGN(uchar2, <<=)
+DECLOP_2VAR_ASSIGN(uchar2, >>=)
+// Prefix increment and decrement.
+DECLOP_2VAR_PREOP(uchar2, ++)
+DECLOP_2VAR_PREOP(uchar2, --)
+// Postfix increment and decrement.
+DECLOP_2VAR_POSTOP(uchar2, ++)
+DECLOP_2VAR_POSTOP(uchar2, --)
+// Relational operators. NOTE(review): confirm DECLOP_2VAR_COMP gives != "any lane differs" semantics.
+DECLOP_2VAR_COMP(uchar2, ==)
+DECLOP_2VAR_COMP(uchar2, !=)
+DECLOP_2VAR_COMP(uchar2, <)
+DECLOP_2VAR_COMP(uchar2, >)
+DECLOP_2VAR_COMP(uchar2, <=)
+DECLOP_2VAR_COMP(uchar2, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_2VAR_COMP(uchar2, &&)
+DECLOP_2VAR_COMP(uchar2, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_2VAR_1IN_1OUT(uchar2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(uchar2, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, float)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, double)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(uchar2, signed long long)
+
+// UNSIGNED CHAR3 -- operator overloads for uchar3, one DECLOP_3VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_3VAR_2IN_1OUT(uchar3, +)
+DECLOP_3VAR_2IN_1OUT(uchar3, -)
+DECLOP_3VAR_2IN_1OUT(uchar3, *)
+DECLOP_3VAR_2IN_1OUT(uchar3, /)
+DECLOP_3VAR_2IN_1OUT(uchar3, %)
+DECLOP_3VAR_2IN_1OUT(uchar3, &)
+DECLOP_3VAR_2IN_1OUT(uchar3, |)
+DECLOP_3VAR_2IN_1OUT(uchar3, ^)
+DECLOP_3VAR_2IN_1OUT(uchar3, <<)
+DECLOP_3VAR_2IN_1OUT(uchar3, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_3VAR_ASSIGN(uchar3, +=)
+DECLOP_3VAR_ASSIGN(uchar3, -=)
+DECLOP_3VAR_ASSIGN(uchar3, *=)
+DECLOP_3VAR_ASSIGN(uchar3, /=)
+DECLOP_3VAR_ASSIGN(uchar3, %=)
+DECLOP_3VAR_ASSIGN(uchar3, &=)
+DECLOP_3VAR_ASSIGN(uchar3, |=)
+DECLOP_3VAR_ASSIGN(uchar3, ^=)
+DECLOP_3VAR_ASSIGN(uchar3, <<=)
+DECLOP_3VAR_ASSIGN(uchar3, >>=)
+// Prefix increment and decrement.
+DECLOP_3VAR_PREOP(uchar3, ++)
+DECLOP_3VAR_PREOP(uchar3, --)
+// Postfix increment and decrement.
+DECLOP_3VAR_POSTOP(uchar3, ++)
+DECLOP_3VAR_POSTOP(uchar3, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_3VAR_COMP(uchar3, ==)
+__device__ __host__ inline bool operator != (const uchar3& lhs, const uchar3& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z; }
+DECLOP_3VAR_COMP(uchar3, <)
+DECLOP_3VAR_COMP(uchar3, >)
+DECLOP_3VAR_COMP(uchar3, <=)
+DECLOP_3VAR_COMP(uchar3, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_3VAR_COMP(uchar3, &&)
+DECLOP_3VAR_COMP(uchar3, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_3VAR_1IN_1OUT(uchar3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(uchar3, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, float)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, double)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(uchar3, signed long long)
+
+// UNSIGNED CHAR4 -- operator overloads for uchar4, one DECLOP_4VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_4VAR_2IN_1OUT(uchar4, +)
+DECLOP_4VAR_2IN_1OUT(uchar4, -)
+DECLOP_4VAR_2IN_1OUT(uchar4, *)
+DECLOP_4VAR_2IN_1OUT(uchar4, /)
+DECLOP_4VAR_2IN_1OUT(uchar4, %)
+DECLOP_4VAR_2IN_1OUT(uchar4, &)
+DECLOP_4VAR_2IN_1OUT(uchar4, |)
+DECLOP_4VAR_2IN_1OUT(uchar4, ^)
+DECLOP_4VAR_2IN_1OUT(uchar4, <<)
+DECLOP_4VAR_2IN_1OUT(uchar4, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_4VAR_ASSIGN(uchar4, +=)
+DECLOP_4VAR_ASSIGN(uchar4, -=)
+DECLOP_4VAR_ASSIGN(uchar4, *=)
+DECLOP_4VAR_ASSIGN(uchar4, /=)
+DECLOP_4VAR_ASSIGN(uchar4, %=)
+DECLOP_4VAR_ASSIGN(uchar4, &=)
+DECLOP_4VAR_ASSIGN(uchar4, |=)
+DECLOP_4VAR_ASSIGN(uchar4, ^=)
+DECLOP_4VAR_ASSIGN(uchar4, <<=)
+DECLOP_4VAR_ASSIGN(uchar4, >>=)
+// Prefix increment and decrement.
+DECLOP_4VAR_PREOP(uchar4, ++)
+DECLOP_4VAR_PREOP(uchar4, --)
+// Postfix increment and decrement.
+DECLOP_4VAR_POSTOP(uchar4, ++)
+DECLOP_4VAR_POSTOP(uchar4, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_4VAR_COMP(uchar4, ==)
+__device__ __host__ inline bool operator != (const uchar4& lhs, const uchar4& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z || lhs.w != rhs.w; }
+DECLOP_4VAR_COMP(uchar4, <)
+DECLOP_4VAR_COMP(uchar4, >)
+DECLOP_4VAR_COMP(uchar4, <=)
+DECLOP_4VAR_COMP(uchar4, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_4VAR_COMP(uchar4, &&)
+DECLOP_4VAR_COMP(uchar4, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_4VAR_1IN_1OUT(uchar4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(uchar4, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, float)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, double)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(uchar4, signed long long)
+
+// SIGNED CHAR1 -- operator overloads for char1, one DECLOP_1VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_1VAR_2IN_1OUT(char1, +)
+DECLOP_1VAR_2IN_1OUT(char1, -)
+DECLOP_1VAR_2IN_1OUT(char1, *)
+DECLOP_1VAR_2IN_1OUT(char1, /)
+DECLOP_1VAR_2IN_1OUT(char1, %)
+DECLOP_1VAR_2IN_1OUT(char1, &)
+DECLOP_1VAR_2IN_1OUT(char1, |)
+DECLOP_1VAR_2IN_1OUT(char1, ^)
+DECLOP_1VAR_2IN_1OUT(char1, <<)
+DECLOP_1VAR_2IN_1OUT(char1, >>)
+
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_1VAR_ASSIGN(char1, +=)
+DECLOP_1VAR_ASSIGN(char1, -=)
+DECLOP_1VAR_ASSIGN(char1, *=)
+DECLOP_1VAR_ASSIGN(char1, /=)
+DECLOP_1VAR_ASSIGN(char1, %=)
+DECLOP_1VAR_ASSIGN(char1, &=)
+DECLOP_1VAR_ASSIGN(char1, |=)
+DECLOP_1VAR_ASSIGN(char1, ^=)
+DECLOP_1VAR_ASSIGN(char1, <<=)
+DECLOP_1VAR_ASSIGN(char1, >>=)
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(char1, ++)
+DECLOP_1VAR_PREOP(char1, --)
+// Postfix increment and decrement.
+DECLOP_1VAR_POSTOP(char1, ++)
+DECLOP_1VAR_POSTOP(char1, --)
+// Relational operators.
+DECLOP_1VAR_COMP(char1, ==)
+DECLOP_1VAR_COMP(char1, !=)
+DECLOP_1VAR_COMP(char1, <)
+DECLOP_1VAR_COMP(char1, >)
+DECLOP_1VAR_COMP(char1, <=)
+DECLOP_1VAR_COMP(char1, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_1VAR_COMP(char1, &&)
+DECLOP_1VAR_COMP(char1, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_1VAR_1IN_1OUT(char1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(char1, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(char1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(char1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(char1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(char1, float)
+DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(char1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(char1, double)
+DECLOP_1VAR_SCALE_PRODUCT(char1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(char1, signed long long)
+
+// SIGNED CHAR2 -- operator overloads for char2, one DECLOP_2VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_2VAR_2IN_1OUT(char2, +)
+DECLOP_2VAR_2IN_1OUT(char2, -)
+DECLOP_2VAR_2IN_1OUT(char2, *)
+DECLOP_2VAR_2IN_1OUT(char2, /)
+DECLOP_2VAR_2IN_1OUT(char2, %)
+DECLOP_2VAR_2IN_1OUT(char2, &)
+DECLOP_2VAR_2IN_1OUT(char2, |)
+DECLOP_2VAR_2IN_1OUT(char2, ^)
+DECLOP_2VAR_2IN_1OUT(char2, <<)
+DECLOP_2VAR_2IN_1OUT(char2, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_2VAR_ASSIGN(char2, +=)
+DECLOP_2VAR_ASSIGN(char2, -=)
+DECLOP_2VAR_ASSIGN(char2, *=)
+DECLOP_2VAR_ASSIGN(char2, /=)
+DECLOP_2VAR_ASSIGN(char2, %=)
+DECLOP_2VAR_ASSIGN(char2, &=)
+DECLOP_2VAR_ASSIGN(char2, |=)
+DECLOP_2VAR_ASSIGN(char2, ^=)
+DECLOP_2VAR_ASSIGN(char2, <<=)
+DECLOP_2VAR_ASSIGN(char2, >>=)
+// Prefix increment and decrement.
+DECLOP_2VAR_PREOP(char2, ++)
+DECLOP_2VAR_PREOP(char2, --)
+// Postfix increment and decrement.
+DECLOP_2VAR_POSTOP(char2, ++)
+DECLOP_2VAR_POSTOP(char2, --)
+// Relational operators. NOTE(review): confirm DECLOP_2VAR_COMP gives != "any lane differs" semantics.
+DECLOP_2VAR_COMP(char2, ==)
+DECLOP_2VAR_COMP(char2, !=)
+DECLOP_2VAR_COMP(char2, <)
+DECLOP_2VAR_COMP(char2, >)
+DECLOP_2VAR_COMP(char2, <=)
+DECLOP_2VAR_COMP(char2, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_2VAR_COMP(char2, &&)
+DECLOP_2VAR_COMP(char2, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_2VAR_1IN_1OUT(char2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(char2, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(char2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(char2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(char2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(char2, float)
+DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(char2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(char2, double)
+DECLOP_2VAR_SCALE_PRODUCT(char2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(char2, signed long long)
+
+// SIGNED CHAR3 -- operator overloads for char3, one DECLOP_3VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_3VAR_2IN_1OUT(char3, +)
+DECLOP_3VAR_2IN_1OUT(char3, -)
+DECLOP_3VAR_2IN_1OUT(char3, *)
+DECLOP_3VAR_2IN_1OUT(char3, /)
+DECLOP_3VAR_2IN_1OUT(char3, %)
+DECLOP_3VAR_2IN_1OUT(char3, &)
+DECLOP_3VAR_2IN_1OUT(char3, |)
+DECLOP_3VAR_2IN_1OUT(char3, ^)
+DECLOP_3VAR_2IN_1OUT(char3, <<)
+DECLOP_3VAR_2IN_1OUT(char3, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_3VAR_ASSIGN(char3, +=)
+DECLOP_3VAR_ASSIGN(char3, -=)
+DECLOP_3VAR_ASSIGN(char3, *=)
+DECLOP_3VAR_ASSIGN(char3, /=)
+DECLOP_3VAR_ASSIGN(char3, %=)
+DECLOP_3VAR_ASSIGN(char3, &=)
+DECLOP_3VAR_ASSIGN(char3, |=)
+DECLOP_3VAR_ASSIGN(char3, ^=)
+DECLOP_3VAR_ASSIGN(char3, <<=)
+DECLOP_3VAR_ASSIGN(char3, >>=)
+// Prefix increment and decrement.
+DECLOP_3VAR_PREOP(char3, ++)
+DECLOP_3VAR_PREOP(char3, --)
+// Postfix increment and decrement.
+DECLOP_3VAR_POSTOP(char3, ++)
+DECLOP_3VAR_POSTOP(char3, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_3VAR_COMP(char3, ==)
+__device__ __host__ inline bool operator != (const char3& lhs, const char3& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z; }
+DECLOP_3VAR_COMP(char3, <)
+DECLOP_3VAR_COMP(char3, >)
+DECLOP_3VAR_COMP(char3, <=)
+DECLOP_3VAR_COMP(char3, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_3VAR_COMP(char3, &&)
+DECLOP_3VAR_COMP(char3, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_3VAR_1IN_1OUT(char3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(char3, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(char3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(char3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(char3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(char3, float)
+DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(char3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(char3, double)
+DECLOP_3VAR_SCALE_PRODUCT(char3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(char3, signed long long)
+
+// SIGNED CHAR4 -- operator overloads for char4, one DECLOP_4VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_4VAR_2IN_1OUT(char4, +)
+DECLOP_4VAR_2IN_1OUT(char4, -)
+DECLOP_4VAR_2IN_1OUT(char4, *)
+DECLOP_4VAR_2IN_1OUT(char4, /)
+DECLOP_4VAR_2IN_1OUT(char4, %)
+DECLOP_4VAR_2IN_1OUT(char4, &)
+DECLOP_4VAR_2IN_1OUT(char4, |)
+DECLOP_4VAR_2IN_1OUT(char4, ^)
+DECLOP_4VAR_2IN_1OUT(char4, <<)
+DECLOP_4VAR_2IN_1OUT(char4, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_4VAR_ASSIGN(char4, +=)
+DECLOP_4VAR_ASSIGN(char4, -=)
+DECLOP_4VAR_ASSIGN(char4, *=)
+DECLOP_4VAR_ASSIGN(char4, /=)
+DECLOP_4VAR_ASSIGN(char4, %=)
+DECLOP_4VAR_ASSIGN(char4, &=)
+DECLOP_4VAR_ASSIGN(char4, |=)
+DECLOP_4VAR_ASSIGN(char4, ^=)
+DECLOP_4VAR_ASSIGN(char4, <<=)
+DECLOP_4VAR_ASSIGN(char4, >>=)
+// Prefix increment and decrement.
+DECLOP_4VAR_PREOP(char4, ++)
+DECLOP_4VAR_PREOP(char4, --)
+// Postfix increment and decrement.
+DECLOP_4VAR_POSTOP(char4, ++)
+DECLOP_4VAR_POSTOP(char4, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_4VAR_COMP(char4, ==)
+__device__ __host__ inline bool operator != (const char4& lhs, const char4& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z || lhs.w != rhs.w; }
+DECLOP_4VAR_COMP(char4, <)
+DECLOP_4VAR_COMP(char4, >)
+DECLOP_4VAR_COMP(char4, <=)
+DECLOP_4VAR_COMP(char4, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_4VAR_COMP(char4, &&)
+DECLOP_4VAR_COMP(char4, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_4VAR_1IN_1OUT(char4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(char4, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(char4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(char4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(char4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(char4, float)
+DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(char4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(char4, double)
+DECLOP_4VAR_SCALE_PRODUCT(char4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(char4, signed long long)
+
+// UNSIGNED SHORT1 -- operator overloads for ushort1, one DECLOP_1VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_1VAR_2IN_1OUT(ushort1, +)
+DECLOP_1VAR_2IN_1OUT(ushort1, -)
+DECLOP_1VAR_2IN_1OUT(ushort1, *)
+DECLOP_1VAR_2IN_1OUT(ushort1, /)
+DECLOP_1VAR_2IN_1OUT(ushort1, %)
+DECLOP_1VAR_2IN_1OUT(ushort1, &)
+DECLOP_1VAR_2IN_1OUT(ushort1, |)
+DECLOP_1VAR_2IN_1OUT(ushort1, ^)
+DECLOP_1VAR_2IN_1OUT(ushort1, <<)
+DECLOP_1VAR_2IN_1OUT(ushort1, >>)
+
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_1VAR_ASSIGN(ushort1, +=)
+DECLOP_1VAR_ASSIGN(ushort1, -=)
+DECLOP_1VAR_ASSIGN(ushort1, *=)
+DECLOP_1VAR_ASSIGN(ushort1, /=)
+DECLOP_1VAR_ASSIGN(ushort1, %=)
+DECLOP_1VAR_ASSIGN(ushort1, &=)
+DECLOP_1VAR_ASSIGN(ushort1, |=)
+DECLOP_1VAR_ASSIGN(ushort1, ^=)
+DECLOP_1VAR_ASSIGN(ushort1, <<=)
+DECLOP_1VAR_ASSIGN(ushort1, >>=)
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(ushort1, ++)
+DECLOP_1VAR_PREOP(ushort1, --)
+// Postfix increment and decrement.
+DECLOP_1VAR_POSTOP(ushort1, ++)
+DECLOP_1VAR_POSTOP(ushort1, --)
+// Relational operators.
+DECLOP_1VAR_COMP(ushort1, ==)
+DECLOP_1VAR_COMP(ushort1, !=)
+DECLOP_1VAR_COMP(ushort1, <)
+DECLOP_1VAR_COMP(ushort1, >)
+DECLOP_1VAR_COMP(ushort1, <=)
+DECLOP_1VAR_COMP(ushort1, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_1VAR_COMP(ushort1, &&)
+DECLOP_1VAR_COMP(ushort1, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_1VAR_1IN_1OUT(ushort1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(ushort1, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, float)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, double)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(ushort1, signed long long)
+
+// UNSIGNED SHORT2 -- operator overloads for ushort2, one DECLOP_2VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_2VAR_2IN_1OUT(ushort2, +)
+DECLOP_2VAR_2IN_1OUT(ushort2, -)
+DECLOP_2VAR_2IN_1OUT(ushort2, *)
+DECLOP_2VAR_2IN_1OUT(ushort2, /)
+DECLOP_2VAR_2IN_1OUT(ushort2, %)
+DECLOP_2VAR_2IN_1OUT(ushort2, &)
+DECLOP_2VAR_2IN_1OUT(ushort2, |)
+DECLOP_2VAR_2IN_1OUT(ushort2, ^)
+DECLOP_2VAR_2IN_1OUT(ushort2, <<)
+DECLOP_2VAR_2IN_1OUT(ushort2, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_2VAR_ASSIGN(ushort2, +=)
+DECLOP_2VAR_ASSIGN(ushort2, -=)
+DECLOP_2VAR_ASSIGN(ushort2, *=)
+DECLOP_2VAR_ASSIGN(ushort2, /=)
+DECLOP_2VAR_ASSIGN(ushort2, %=)
+DECLOP_2VAR_ASSIGN(ushort2, &=)
+DECLOP_2VAR_ASSIGN(ushort2, |=)
+DECLOP_2VAR_ASSIGN(ushort2, ^=)
+DECLOP_2VAR_ASSIGN(ushort2, <<=)
+DECLOP_2VAR_ASSIGN(ushort2, >>=)
+// Prefix increment and decrement.
+DECLOP_2VAR_PREOP(ushort2, ++)
+DECLOP_2VAR_PREOP(ushort2, --)
+// Postfix increment and decrement.
+DECLOP_2VAR_POSTOP(ushort2, ++)
+DECLOP_2VAR_POSTOP(ushort2, --)
+// Relational operators. NOTE(review): confirm DECLOP_2VAR_COMP gives != "any lane differs" semantics.
+DECLOP_2VAR_COMP(ushort2, ==)
+DECLOP_2VAR_COMP(ushort2, !=)
+DECLOP_2VAR_COMP(ushort2, <)
+DECLOP_2VAR_COMP(ushort2, >)
+DECLOP_2VAR_COMP(ushort2, <=)
+DECLOP_2VAR_COMP(ushort2, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_2VAR_COMP(ushort2, &&)
+DECLOP_2VAR_COMP(ushort2, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_2VAR_1IN_1OUT(ushort2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(ushort2, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, float)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, double)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(ushort2, signed long long)
+
+// UNSIGNED SHORT3 -- operator overloads for ushort3, one DECLOP_3VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_3VAR_2IN_1OUT(ushort3, +)
+DECLOP_3VAR_2IN_1OUT(ushort3, -)
+DECLOP_3VAR_2IN_1OUT(ushort3, *)
+DECLOP_3VAR_2IN_1OUT(ushort3, /)
+DECLOP_3VAR_2IN_1OUT(ushort3, %)
+DECLOP_3VAR_2IN_1OUT(ushort3, &)
+DECLOP_3VAR_2IN_1OUT(ushort3, |)
+DECLOP_3VAR_2IN_1OUT(ushort3, ^)
+DECLOP_3VAR_2IN_1OUT(ushort3, <<)
+DECLOP_3VAR_2IN_1OUT(ushort3, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_3VAR_ASSIGN(ushort3, +=)
+DECLOP_3VAR_ASSIGN(ushort3, -=)
+DECLOP_3VAR_ASSIGN(ushort3, *=)
+DECLOP_3VAR_ASSIGN(ushort3, /=)
+DECLOP_3VAR_ASSIGN(ushort3, %=)
+DECLOP_3VAR_ASSIGN(ushort3, &=)
+DECLOP_3VAR_ASSIGN(ushort3, |=)
+DECLOP_3VAR_ASSIGN(ushort3, ^=)
+DECLOP_3VAR_ASSIGN(ushort3, <<=)
+DECLOP_3VAR_ASSIGN(ushort3, >>=)
+// Prefix increment and decrement.
+DECLOP_3VAR_PREOP(ushort3, ++)
+DECLOP_3VAR_PREOP(ushort3, --)
+// Postfix increment and decrement.
+DECLOP_3VAR_POSTOP(ushort3, ++)
+DECLOP_3VAR_POSTOP(ushort3, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_3VAR_COMP(ushort3, ==)
+__device__ __host__ inline bool operator != (const ushort3& lhs, const ushort3& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z; }
+DECLOP_3VAR_COMP(ushort3, <)
+DECLOP_3VAR_COMP(ushort3, >)
+DECLOP_3VAR_COMP(ushort3, <=)
+DECLOP_3VAR_COMP(ushort3, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_3VAR_COMP(ushort3, &&)
+DECLOP_3VAR_COMP(ushort3, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_3VAR_1IN_1OUT(ushort3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(ushort3, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, float)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, double)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(ushort3, signed long long)
+
+// UNSIGNED SHORT4 -- operator overloads for ushort4, one DECLOP_4VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_4VAR_2IN_1OUT(ushort4, +)
+DECLOP_4VAR_2IN_1OUT(ushort4, -)
+DECLOP_4VAR_2IN_1OUT(ushort4, *)
+DECLOP_4VAR_2IN_1OUT(ushort4, /)
+DECLOP_4VAR_2IN_1OUT(ushort4, %)
+DECLOP_4VAR_2IN_1OUT(ushort4, &)
+DECLOP_4VAR_2IN_1OUT(ushort4, |)
+DECLOP_4VAR_2IN_1OUT(ushort4, ^)
+DECLOP_4VAR_2IN_1OUT(ushort4, <<)
+DECLOP_4VAR_2IN_1OUT(ushort4, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_4VAR_ASSIGN(ushort4, +=)
+DECLOP_4VAR_ASSIGN(ushort4, -=)
+DECLOP_4VAR_ASSIGN(ushort4, *=)
+DECLOP_4VAR_ASSIGN(ushort4, /=)
+DECLOP_4VAR_ASSIGN(ushort4, %=)
+DECLOP_4VAR_ASSIGN(ushort4, &=)
+DECLOP_4VAR_ASSIGN(ushort4, |=)
+DECLOP_4VAR_ASSIGN(ushort4, ^=)
+DECLOP_4VAR_ASSIGN(ushort4, <<=)
+DECLOP_4VAR_ASSIGN(ushort4, >>=)
+// Prefix increment and decrement.
+DECLOP_4VAR_PREOP(ushort4, ++)
+DECLOP_4VAR_PREOP(ushort4, --)
+// Postfix increment and decrement.
+DECLOP_4VAR_POSTOP(ushort4, ++)
+DECLOP_4VAR_POSTOP(ushort4, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_4VAR_COMP(ushort4, ==)
+__device__ __host__ inline bool operator != (const ushort4& lhs, const ushort4& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z || lhs.w != rhs.w; }
+DECLOP_4VAR_COMP(ushort4, <)
+DECLOP_4VAR_COMP(ushort4, >)
+DECLOP_4VAR_COMP(ushort4, <=)
+DECLOP_4VAR_COMP(ushort4, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_4VAR_COMP(ushort4, &&)
+DECLOP_4VAR_COMP(ushort4, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_4VAR_1IN_1OUT(ushort4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(ushort4, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, float)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, double)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(ushort4, signed long long)
+
+// SIGNED SHORT1 -- operator overloads for short1, one DECLOP_1VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_1VAR_2IN_1OUT(short1, +)
+DECLOP_1VAR_2IN_1OUT(short1, -)
+DECLOP_1VAR_2IN_1OUT(short1, *)
+DECLOP_1VAR_2IN_1OUT(short1, /)
+DECLOP_1VAR_2IN_1OUT(short1, %)
+DECLOP_1VAR_2IN_1OUT(short1, &)
+DECLOP_1VAR_2IN_1OUT(short1, |)
+DECLOP_1VAR_2IN_1OUT(short1, ^)
+DECLOP_1VAR_2IN_1OUT(short1, <<)
+DECLOP_1VAR_2IN_1OUT(short1, >>)
+
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_1VAR_ASSIGN(short1, +=)
+DECLOP_1VAR_ASSIGN(short1, -=)
+DECLOP_1VAR_ASSIGN(short1, *=)
+DECLOP_1VAR_ASSIGN(short1, /=)
+DECLOP_1VAR_ASSIGN(short1, %=)
+DECLOP_1VAR_ASSIGN(short1, &=)
+DECLOP_1VAR_ASSIGN(short1, |=)
+DECLOP_1VAR_ASSIGN(short1, ^=)
+DECLOP_1VAR_ASSIGN(short1, <<=)
+DECLOP_1VAR_ASSIGN(short1, >>=)
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(short1, ++)
+DECLOP_1VAR_PREOP(short1, --)
+// Postfix increment and decrement.
+DECLOP_1VAR_POSTOP(short1, ++)
+DECLOP_1VAR_POSTOP(short1, --)
+// Relational operators.
+DECLOP_1VAR_COMP(short1, ==)
+DECLOP_1VAR_COMP(short1, !=)
+DECLOP_1VAR_COMP(short1, <)
+DECLOP_1VAR_COMP(short1, >)
+DECLOP_1VAR_COMP(short1, <=)
+DECLOP_1VAR_COMP(short1, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_1VAR_COMP(short1, &&)
+DECLOP_1VAR_COMP(short1, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_1VAR_1IN_1OUT(short1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(short1, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(short1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(short1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(short1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(short1, float)
+DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(short1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(short1, double)
+DECLOP_1VAR_SCALE_PRODUCT(short1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(short1, signed long long)
+
+// SIGNED SHORT2 -- operator overloads for short2, one DECLOP_2VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_2VAR_2IN_1OUT(short2, +)
+DECLOP_2VAR_2IN_1OUT(short2, -)
+DECLOP_2VAR_2IN_1OUT(short2, *)
+DECLOP_2VAR_2IN_1OUT(short2, /)
+DECLOP_2VAR_2IN_1OUT(short2, %)
+DECLOP_2VAR_2IN_1OUT(short2, &)
+DECLOP_2VAR_2IN_1OUT(short2, |)
+DECLOP_2VAR_2IN_1OUT(short2, ^)
+DECLOP_2VAR_2IN_1OUT(short2, <<)
+DECLOP_2VAR_2IN_1OUT(short2, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_2VAR_ASSIGN(short2, +=)
+DECLOP_2VAR_ASSIGN(short2, -=)
+DECLOP_2VAR_ASSIGN(short2, *=)
+DECLOP_2VAR_ASSIGN(short2, /=)
+DECLOP_2VAR_ASSIGN(short2, %=)
+DECLOP_2VAR_ASSIGN(short2, &=)
+DECLOP_2VAR_ASSIGN(short2, |=)
+DECLOP_2VAR_ASSIGN(short2, ^=)
+DECLOP_2VAR_ASSIGN(short2, <<=)
+DECLOP_2VAR_ASSIGN(short2, >>=)
+// Prefix increment and decrement.
+DECLOP_2VAR_PREOP(short2, ++)
+DECLOP_2VAR_PREOP(short2, --)
+// Postfix increment and decrement.
+DECLOP_2VAR_POSTOP(short2, ++)
+DECLOP_2VAR_POSTOP(short2, --)
+// Relational operators. NOTE(review): confirm DECLOP_2VAR_COMP gives != "any lane differs" semantics.
+DECLOP_2VAR_COMP(short2, ==)
+DECLOP_2VAR_COMP(short2, !=)
+DECLOP_2VAR_COMP(short2, <)
+DECLOP_2VAR_COMP(short2, >)
+DECLOP_2VAR_COMP(short2, <=)
+DECLOP_2VAR_COMP(short2, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_2VAR_COMP(short2, &&)
+DECLOP_2VAR_COMP(short2, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_2VAR_1IN_1OUT(short2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(short2, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(short2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(short2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(short2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(short2, float)
+DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(short2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(short2, double)
+DECLOP_2VAR_SCALE_PRODUCT(short2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(short2, signed long long)
+
+// SIGNED SHORT3 -- operator overloads for short3, one DECLOP_3VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_3VAR_2IN_1OUT(short3, +)
+DECLOP_3VAR_2IN_1OUT(short3, -)
+DECLOP_3VAR_2IN_1OUT(short3, *)
+DECLOP_3VAR_2IN_1OUT(short3, /)
+DECLOP_3VAR_2IN_1OUT(short3, %)
+DECLOP_3VAR_2IN_1OUT(short3, &)
+DECLOP_3VAR_2IN_1OUT(short3, |)
+DECLOP_3VAR_2IN_1OUT(short3, ^)
+DECLOP_3VAR_2IN_1OUT(short3, <<)
+DECLOP_3VAR_2IN_1OUT(short3, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_3VAR_ASSIGN(short3, +=)
+DECLOP_3VAR_ASSIGN(short3, -=)
+DECLOP_3VAR_ASSIGN(short3, *=)
+DECLOP_3VAR_ASSIGN(short3, /=)
+DECLOP_3VAR_ASSIGN(short3, %=)
+DECLOP_3VAR_ASSIGN(short3, &=)
+DECLOP_3VAR_ASSIGN(short3, |=)
+DECLOP_3VAR_ASSIGN(short3, ^=)
+DECLOP_3VAR_ASSIGN(short3, <<=)
+DECLOP_3VAR_ASSIGN(short3, >>=)
+// Prefix increment and decrement.
+DECLOP_3VAR_PREOP(short3, ++)
+DECLOP_3VAR_PREOP(short3, --)
+// Postfix increment and decrement.
+DECLOP_3VAR_POSTOP(short3, ++)
+DECLOP_3VAR_POSTOP(short3, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_3VAR_COMP(short3, ==)
+__device__ __host__ inline bool operator != (const short3& lhs, const short3& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z; }
+DECLOP_3VAR_COMP(short3, <)
+DECLOP_3VAR_COMP(short3, >)
+DECLOP_3VAR_COMP(short3, <=)
+DECLOP_3VAR_COMP(short3, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_3VAR_COMP(short3, &&)
+DECLOP_3VAR_COMP(short3, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_3VAR_1IN_1OUT(short3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(short3, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(short3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(short3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(short3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(short3, float)
+DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(short3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(short3, double)
+DECLOP_3VAR_SCALE_PRODUCT(short3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(short3, signed long long)
+
+// SIGNED SHORT4 -- operator overloads for short4, one DECLOP_4VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_4VAR_2IN_1OUT(short4, +)
+DECLOP_4VAR_2IN_1OUT(short4, -)
+DECLOP_4VAR_2IN_1OUT(short4, *)
+DECLOP_4VAR_2IN_1OUT(short4, /)
+DECLOP_4VAR_2IN_1OUT(short4, %)
+DECLOP_4VAR_2IN_1OUT(short4, &)
+DECLOP_4VAR_2IN_1OUT(short4, |)
+DECLOP_4VAR_2IN_1OUT(short4, ^)
+DECLOP_4VAR_2IN_1OUT(short4, <<)
+DECLOP_4VAR_2IN_1OUT(short4, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_4VAR_ASSIGN(short4, +=)
+DECLOP_4VAR_ASSIGN(short4, -=)
+DECLOP_4VAR_ASSIGN(short4, *=)
+DECLOP_4VAR_ASSIGN(short4, /=)
+DECLOP_4VAR_ASSIGN(short4, %=)
+DECLOP_4VAR_ASSIGN(short4, &=)
+DECLOP_4VAR_ASSIGN(short4, |=)
+DECLOP_4VAR_ASSIGN(short4, ^=)
+DECLOP_4VAR_ASSIGN(short4, <<=)
+DECLOP_4VAR_ASSIGN(short4, >>=)
+// Prefix increment and decrement.
+DECLOP_4VAR_PREOP(short4, ++)
+DECLOP_4VAR_PREOP(short4, --)
+// Postfix increment and decrement.
+DECLOP_4VAR_POSTOP(short4, ++)
+DECLOP_4VAR_POSTOP(short4, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_4VAR_COMP(short4, ==)
+__device__ __host__ inline bool operator != (const short4& lhs, const short4& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z || lhs.w != rhs.w; }
+DECLOP_4VAR_COMP(short4, <)
+DECLOP_4VAR_COMP(short4, >)
+DECLOP_4VAR_COMP(short4, <=)
+DECLOP_4VAR_COMP(short4, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_4VAR_COMP(short4, &&)
+DECLOP_4VAR_COMP(short4, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_4VAR_1IN_1OUT(short4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(short4, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(short4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(short4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(short4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(short4, float)
+DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(short4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(short4, double)
+DECLOP_4VAR_SCALE_PRODUCT(short4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(short4, signed long long)
+
+// UNSIGNED INT1 -- operator overloads for uint1, one DECLOP_1VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_1VAR_2IN_1OUT(uint1, +)
+DECLOP_1VAR_2IN_1OUT(uint1, -)
+DECLOP_1VAR_2IN_1OUT(uint1, *)
+DECLOP_1VAR_2IN_1OUT(uint1, /)
+DECLOP_1VAR_2IN_1OUT(uint1, %)
+DECLOP_1VAR_2IN_1OUT(uint1, &)
+DECLOP_1VAR_2IN_1OUT(uint1, |)
+DECLOP_1VAR_2IN_1OUT(uint1, ^)
+DECLOP_1VAR_2IN_1OUT(uint1, <<)
+DECLOP_1VAR_2IN_1OUT(uint1, >>)
+
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_1VAR_ASSIGN(uint1, +=)
+DECLOP_1VAR_ASSIGN(uint1, -=)
+DECLOP_1VAR_ASSIGN(uint1, *=)
+DECLOP_1VAR_ASSIGN(uint1, /=)
+DECLOP_1VAR_ASSIGN(uint1, %=)
+DECLOP_1VAR_ASSIGN(uint1, &=)
+DECLOP_1VAR_ASSIGN(uint1, |=)
+DECLOP_1VAR_ASSIGN(uint1, ^=)
+DECLOP_1VAR_ASSIGN(uint1, <<=)
+DECLOP_1VAR_ASSIGN(uint1, >>=)
+// Prefix increment and decrement.
+DECLOP_1VAR_PREOP(uint1, ++)
+DECLOP_1VAR_PREOP(uint1, --)
+// Postfix increment and decrement.
+DECLOP_1VAR_POSTOP(uint1, ++)
+DECLOP_1VAR_POSTOP(uint1, --)
+// Relational operators.
+DECLOP_1VAR_COMP(uint1, ==)
+DECLOP_1VAR_COMP(uint1, !=)
+DECLOP_1VAR_COMP(uint1, <)
+DECLOP_1VAR_COMP(uint1, >)
+DECLOP_1VAR_COMP(uint1, <=)
+DECLOP_1VAR_COMP(uint1, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_1VAR_COMP(uint1, &&)
+DECLOP_1VAR_COMP(uint1, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_1VAR_1IN_1OUT(uint1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(uint1, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, float)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, double)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(uint1, signed long long)
+
+// UNSIGNED INT2 -- operator overloads for uint2, one DECLOP_2VAR_* expansion per operator token.
+// Notes go by macro name (bodies are defined before this point). Binary arithmetic, bitwise, shift:
+DECLOP_2VAR_2IN_1OUT(uint2, +)
+DECLOP_2VAR_2IN_1OUT(uint2, -)
+DECLOP_2VAR_2IN_1OUT(uint2, *)
+DECLOP_2VAR_2IN_1OUT(uint2, /)
+DECLOP_2VAR_2IN_1OUT(uint2, %)
+DECLOP_2VAR_2IN_1OUT(uint2, &)
+DECLOP_2VAR_2IN_1OUT(uint2, |)
+DECLOP_2VAR_2IN_1OUT(uint2, ^)
+DECLOP_2VAR_2IN_1OUT(uint2, <<)
+DECLOP_2VAR_2IN_1OUT(uint2, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_2VAR_ASSIGN(uint2, +=)
+DECLOP_2VAR_ASSIGN(uint2, -=)
+DECLOP_2VAR_ASSIGN(uint2, *=)
+DECLOP_2VAR_ASSIGN(uint2, /=)
+DECLOP_2VAR_ASSIGN(uint2, %=)
+DECLOP_2VAR_ASSIGN(uint2, &=)
+DECLOP_2VAR_ASSIGN(uint2, |=)
+DECLOP_2VAR_ASSIGN(uint2, ^=)
+DECLOP_2VAR_ASSIGN(uint2, <<=)
+DECLOP_2VAR_ASSIGN(uint2, >>=)
+// Prefix increment and decrement.
+DECLOP_2VAR_PREOP(uint2, ++)
+DECLOP_2VAR_PREOP(uint2, --)
+// Postfix increment and decrement.
+DECLOP_2VAR_POSTOP(uint2, ++)
+DECLOP_2VAR_POSTOP(uint2, --)
+// Relational operators. NOTE(review): confirm DECLOP_2VAR_COMP gives != "any lane differs" semantics.
+DECLOP_2VAR_COMP(uint2, ==)
+DECLOP_2VAR_COMP(uint2, !=)
+DECLOP_2VAR_COMP(uint2, <)
+DECLOP_2VAR_COMP(uint2, >)
+DECLOP_2VAR_COMP(uint2, <=)
+DECLOP_2VAR_COMP(uint2, >=)
+// Logical && and || via the same comparison macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_2VAR_COMP(uint2, &&)
+DECLOP_2VAR_COMP(uint2, ||)
+// Bitwise complement (~) and logical negation (!).
+DECLOP_2VAR_1IN_1OUT(uint2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(uint2, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, float)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, double)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(uint2, signed long long)
+
+// UNSIGNED INT3 -- operator overloads for uint3, one DECLOP_3VAR_* expansion per operator token.
+// Binary arithmetic, bitwise and shift operators.
+DECLOP_3VAR_2IN_1OUT(uint3, +)
+DECLOP_3VAR_2IN_1OUT(uint3, -)
+DECLOP_3VAR_2IN_1OUT(uint3, *)
+DECLOP_3VAR_2IN_1OUT(uint3, /)
+DECLOP_3VAR_2IN_1OUT(uint3, %)
+DECLOP_3VAR_2IN_1OUT(uint3, &)
+DECLOP_3VAR_2IN_1OUT(uint3, |)
+DECLOP_3VAR_2IN_1OUT(uint3, ^)
+DECLOP_3VAR_2IN_1OUT(uint3, <<)
+DECLOP_3VAR_2IN_1OUT(uint3, >>)
+// Compound-assignment forms of the ten binary operators above.
+DECLOP_3VAR_ASSIGN(uint3, +=)
+DECLOP_3VAR_ASSIGN(uint3, -=)
+DECLOP_3VAR_ASSIGN(uint3, *=)
+DECLOP_3VAR_ASSIGN(uint3, /=)
+DECLOP_3VAR_ASSIGN(uint3, %=)
+DECLOP_3VAR_ASSIGN(uint3, &=)
+DECLOP_3VAR_ASSIGN(uint3, |=)
+DECLOP_3VAR_ASSIGN(uint3, ^=)
+DECLOP_3VAR_ASSIGN(uint3, <<=)
+DECLOP_3VAR_ASSIGN(uint3, >>=)
+// Prefix increment and decrement.
+DECLOP_3VAR_PREOP(uint3, ++)
+DECLOP_3VAR_PREOP(uint3, --)
+// Postfix increment and decrement.
+DECLOP_3VAR_POSTOP(uint3, ++)
+DECLOP_3VAR_POSTOP(uint3, --)
+// Comparisons are true only if they hold in every lane; != is written out so it is true when ANY lane differs.
+DECLOP_3VAR_COMP(uint3, ==)
+__device__ __host__ inline bool operator != (const uint3& lhs, const uint3& rhs) { return lhs.x != rhs.x || lhs.y != rhs.y || lhs.z != rhs.z; }
+DECLOP_3VAR_COMP(uint3, <)
+DECLOP_3VAR_COMP(uint3, >)
+DECLOP_3VAR_COMP(uint3, <=)
+DECLOP_3VAR_COMP(uint3, >=)
+// Logical && and || via the same all-lanes macro; as overloads they evaluate both operands (no short-circuit).
+DECLOP_3VAR_COMP(uint3, &&)
+DECLOP_3VAR_COMP(uint3, ||)
+// Lane-wise bitwise complement (~) and all-lanes logical negation (!).
+DECLOP_3VAR_1IN_1OUT(uint3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(uint3, !)
+// Scalar-product overloads: one expansion per listed built-in scalar type.
+DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, float)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, double)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(uint3, signed long long)
+
+// UNSIGNED INT4: DECLOP_4VAR_* instantiations for uint4 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(uint4, +)
+DECLOP_4VAR_2IN_1OUT(uint4, -)
+DECLOP_4VAR_2IN_1OUT(uint4, *)
+DECLOP_4VAR_2IN_1OUT(uint4, /)
+DECLOP_4VAR_2IN_1OUT(uint4, %)
+DECLOP_4VAR_2IN_1OUT(uint4, &)
+DECLOP_4VAR_2IN_1OUT(uint4, |)
+DECLOP_4VAR_2IN_1OUT(uint4, ^)
+DECLOP_4VAR_2IN_1OUT(uint4, <<)
+DECLOP_4VAR_2IN_1OUT(uint4, >>)
+
+DECLOP_4VAR_ASSIGN(uint4, +=)
+DECLOP_4VAR_ASSIGN(uint4, -=)
+DECLOP_4VAR_ASSIGN(uint4, *=)
+DECLOP_4VAR_ASSIGN(uint4, /=)
+DECLOP_4VAR_ASSIGN(uint4, %=)
+DECLOP_4VAR_ASSIGN(uint4, &=)
+DECLOP_4VAR_ASSIGN(uint4, |=)
+DECLOP_4VAR_ASSIGN(uint4, ^=)
+DECLOP_4VAR_ASSIGN(uint4, <<=)
+DECLOP_4VAR_ASSIGN(uint4, >>=)
+
+DECLOP_4VAR_PREOP(uint4, ++)
+DECLOP_4VAR_PREOP(uint4, --)
+
+DECLOP_4VAR_POSTOP(uint4, ++)
+DECLOP_4VAR_POSTOP(uint4, --)
+
+DECLOP_4VAR_COMP(uint4, ==)
+DECLOP_4VAR_COMP(uint4, !=)
+DECLOP_4VAR_COMP(uint4, <)
+DECLOP_4VAR_COMP(uint4, >)
+DECLOP_4VAR_COMP(uint4, <=)
+DECLOP_4VAR_COMP(uint4, >=)
+
+DECLOP_4VAR_COMP(uint4, &&)
+DECLOP_4VAR_COMP(uint4, ||)
+
+DECLOP_4VAR_1IN_1OUT(uint4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(uint4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, float)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, double)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(uint4, signed long long)
+
+// SIGNED INT1: DECLOP_1VAR_* instantiations for int1 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_1VAR_2IN_1OUT(int1, +)
+DECLOP_1VAR_2IN_1OUT(int1, -)
+DECLOP_1VAR_2IN_1OUT(int1, *)
+DECLOP_1VAR_2IN_1OUT(int1, /)
+DECLOP_1VAR_2IN_1OUT(int1, %)
+DECLOP_1VAR_2IN_1OUT(int1, &)
+DECLOP_1VAR_2IN_1OUT(int1, |)
+DECLOP_1VAR_2IN_1OUT(int1, ^)
+DECLOP_1VAR_2IN_1OUT(int1, <<)
+DECLOP_1VAR_2IN_1OUT(int1, >>)
+
+
+DECLOP_1VAR_ASSIGN(int1, +=)
+DECLOP_1VAR_ASSIGN(int1, -=)
+DECLOP_1VAR_ASSIGN(int1, *=)
+DECLOP_1VAR_ASSIGN(int1, /=)
+DECLOP_1VAR_ASSIGN(int1, %=)
+DECLOP_1VAR_ASSIGN(int1, &=)
+DECLOP_1VAR_ASSIGN(int1, |=)
+DECLOP_1VAR_ASSIGN(int1, ^=)
+DECLOP_1VAR_ASSIGN(int1, <<=)
+DECLOP_1VAR_ASSIGN(int1, >>=)
+
+DECLOP_1VAR_PREOP(int1, ++)
+DECLOP_1VAR_PREOP(int1, --)
+
+DECLOP_1VAR_POSTOP(int1, ++)
+DECLOP_1VAR_POSTOP(int1, --)
+
+DECLOP_1VAR_COMP(int1, ==)
+DECLOP_1VAR_COMP(int1, !=)
+DECLOP_1VAR_COMP(int1, <)
+DECLOP_1VAR_COMP(int1, >)
+DECLOP_1VAR_COMP(int1, <=)
+DECLOP_1VAR_COMP(int1, >=)
+
+DECLOP_1VAR_COMP(int1, &&)
+DECLOP_1VAR_COMP(int1, ||)
+
+DECLOP_1VAR_1IN_1OUT(int1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(int1, !)
+
+DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(int1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(int1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(int1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(int1, float)
+DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(int1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(int1, double)
+DECLOP_1VAR_SCALE_PRODUCT(int1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(int1, signed long long)
+
+// SIGNED INT2: DECLOP_2VAR_* instantiations for int2 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_2VAR_2IN_1OUT(int2, +)
+DECLOP_2VAR_2IN_1OUT(int2, -)
+DECLOP_2VAR_2IN_1OUT(int2, *)
+DECLOP_2VAR_2IN_1OUT(int2, /)
+DECLOP_2VAR_2IN_1OUT(int2, %)
+DECLOP_2VAR_2IN_1OUT(int2, &)
+DECLOP_2VAR_2IN_1OUT(int2, |)
+DECLOP_2VAR_2IN_1OUT(int2, ^)
+DECLOP_2VAR_2IN_1OUT(int2, <<)
+DECLOP_2VAR_2IN_1OUT(int2, >>)
+
+DECLOP_2VAR_ASSIGN(int2, +=)
+DECLOP_2VAR_ASSIGN(int2, -=)
+DECLOP_2VAR_ASSIGN(int2, *=)
+DECLOP_2VAR_ASSIGN(int2, /=)
+DECLOP_2VAR_ASSIGN(int2, %=)
+DECLOP_2VAR_ASSIGN(int2, &=)
+DECLOP_2VAR_ASSIGN(int2, |=)
+DECLOP_2VAR_ASSIGN(int2, ^=)
+DECLOP_2VAR_ASSIGN(int2, <<=)
+DECLOP_2VAR_ASSIGN(int2, >>=)
+
+DECLOP_2VAR_PREOP(int2, ++)
+DECLOP_2VAR_PREOP(int2, --)
+
+DECLOP_2VAR_POSTOP(int2, ++)
+DECLOP_2VAR_POSTOP(int2, --)
+
+DECLOP_2VAR_COMP(int2, ==)
+DECLOP_2VAR_COMP(int2, !=)
+DECLOP_2VAR_COMP(int2, <)
+DECLOP_2VAR_COMP(int2, >)
+DECLOP_2VAR_COMP(int2, <=)
+DECLOP_2VAR_COMP(int2, >=)
+
+DECLOP_2VAR_COMP(int2, &&)
+DECLOP_2VAR_COMP(int2, ||)
+
+DECLOP_2VAR_1IN_1OUT(int2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(int2, !)
+
+DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(int2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(int2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(int2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(int2, float)
+DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(int2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(int2, double)
+DECLOP_2VAR_SCALE_PRODUCT(int2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(int2, signed long long)
+
+// SIGNED INT3: DECLOP_3VAR_* instantiations for int3 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_3VAR_2IN_1OUT(int3, +)
+DECLOP_3VAR_2IN_1OUT(int3, -)
+DECLOP_3VAR_2IN_1OUT(int3, *)
+DECLOP_3VAR_2IN_1OUT(int3, /)
+DECLOP_3VAR_2IN_1OUT(int3, %)
+DECLOP_3VAR_2IN_1OUT(int3, &)
+DECLOP_3VAR_2IN_1OUT(int3, |)
+DECLOP_3VAR_2IN_1OUT(int3, ^)
+DECLOP_3VAR_2IN_1OUT(int3, <<)
+DECLOP_3VAR_2IN_1OUT(int3, >>)
+
+DECLOP_3VAR_ASSIGN(int3, +=)
+DECLOP_3VAR_ASSIGN(int3, -=)
+DECLOP_3VAR_ASSIGN(int3, *=)
+DECLOP_3VAR_ASSIGN(int3, /=)
+DECLOP_3VAR_ASSIGN(int3, %=)
+DECLOP_3VAR_ASSIGN(int3, &=)
+DECLOP_3VAR_ASSIGN(int3, |=)
+DECLOP_3VAR_ASSIGN(int3, ^=)
+DECLOP_3VAR_ASSIGN(int3, <<=)
+DECLOP_3VAR_ASSIGN(int3, >>=)
+
+DECLOP_3VAR_PREOP(int3, ++)
+DECLOP_3VAR_PREOP(int3, --)
+
+DECLOP_3VAR_POSTOP(int3, ++)
+DECLOP_3VAR_POSTOP(int3, --)
+
+DECLOP_3VAR_COMP(int3, ==)
+DECLOP_3VAR_COMP(int3, !=)
+DECLOP_3VAR_COMP(int3, <)
+DECLOP_3VAR_COMP(int3, >)
+DECLOP_3VAR_COMP(int3, <=)
+DECLOP_3VAR_COMP(int3, >=)
+
+DECLOP_3VAR_COMP(int3, &&)
+DECLOP_3VAR_COMP(int3, ||)
+
+DECLOP_3VAR_1IN_1OUT(int3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(int3, !)
+
+DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(int3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(int3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(int3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(int3, float)
+DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(int3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(int3, double)
+DECLOP_3VAR_SCALE_PRODUCT(int3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(int3, signed long long)
+
+// SIGNED INT4: DECLOP_4VAR_* instantiations for int4 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(int4, +)
+DECLOP_4VAR_2IN_1OUT(int4, -)
+DECLOP_4VAR_2IN_1OUT(int4, *)
+DECLOP_4VAR_2IN_1OUT(int4, /)
+DECLOP_4VAR_2IN_1OUT(int4, %)
+DECLOP_4VAR_2IN_1OUT(int4, &)
+DECLOP_4VAR_2IN_1OUT(int4, |)
+DECLOP_4VAR_2IN_1OUT(int4, ^)
+DECLOP_4VAR_2IN_1OUT(int4, <<)
+DECLOP_4VAR_2IN_1OUT(int4, >>)
+
+DECLOP_4VAR_ASSIGN(int4, +=)
+DECLOP_4VAR_ASSIGN(int4, -=)
+DECLOP_4VAR_ASSIGN(int4, *=)
+DECLOP_4VAR_ASSIGN(int4, /=)
+DECLOP_4VAR_ASSIGN(int4, %=)
+DECLOP_4VAR_ASSIGN(int4, &=)
+DECLOP_4VAR_ASSIGN(int4, |=)
+DECLOP_4VAR_ASSIGN(int4, ^=)
+DECLOP_4VAR_ASSIGN(int4, <<=)
+DECLOP_4VAR_ASSIGN(int4, >>=)
+
+DECLOP_4VAR_PREOP(int4, ++)
+DECLOP_4VAR_PREOP(int4, --)
+
+DECLOP_4VAR_POSTOP(int4, ++)
+DECLOP_4VAR_POSTOP(int4, --)
+
+DECLOP_4VAR_COMP(int4, ==)
+DECLOP_4VAR_COMP(int4, !=)
+DECLOP_4VAR_COMP(int4, <)
+DECLOP_4VAR_COMP(int4, >)
+DECLOP_4VAR_COMP(int4, <=)
+DECLOP_4VAR_COMP(int4, >=)
+
+DECLOP_4VAR_COMP(int4, &&)
+DECLOP_4VAR_COMP(int4, ||)
+
+DECLOP_4VAR_1IN_1OUT(int4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(int4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(int4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(int4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(int4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(int4, float)
+DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(int4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(int4, double)
+DECLOP_4VAR_SCALE_PRODUCT(int4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(int4, signed long long)
+
+// FLOAT1: DECLOP_1VAR_* instantiations for float1 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_1VAR_2IN_1OUT(float1, +)
+DECLOP_1VAR_2IN_1OUT(float1, -)
+DECLOP_1VAR_2IN_1OUT(float1, *)
+DECLOP_1VAR_2IN_1OUT(float1, /)
+
+DECLOP_1VAR_ASSIGN(float1, +=)
+DECLOP_1VAR_ASSIGN(float1, -=)
+DECLOP_1VAR_ASSIGN(float1, *=)
+DECLOP_1VAR_ASSIGN(float1, /=)
+
+DECLOP_1VAR_PREOP(float1, ++)
+DECLOP_1VAR_PREOP(float1, --)
+
+DECLOP_1VAR_POSTOP(float1, ++)
+DECLOP_1VAR_POSTOP(float1, --)
+
+DECLOP_1VAR_COMP(float1, ==)
+DECLOP_1VAR_COMP(float1, !=)
+DECLOP_1VAR_COMP(float1, <)
+DECLOP_1VAR_COMP(float1, >)
+DECLOP_1VAR_COMP(float1, <=)
+DECLOP_1VAR_COMP(float1, >=)
+
+DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(float1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(float1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(float1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(float1, float)
+DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(float1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(float1, double)
+DECLOP_1VAR_SCALE_PRODUCT(float1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(float1, signed long long)
+
+// FLOAT2: DECLOP_2VAR_* instantiations for float2 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_2VAR_2IN_1OUT(float2, +)
+DECLOP_2VAR_2IN_1OUT(float2, -)
+DECLOP_2VAR_2IN_1OUT(float2, *)
+DECLOP_2VAR_2IN_1OUT(float2, /)
+
+DECLOP_2VAR_ASSIGN(float2, +=)
+DECLOP_2VAR_ASSIGN(float2, -=)
+DECLOP_2VAR_ASSIGN(float2, *=)
+DECLOP_2VAR_ASSIGN(float2, /=)
+
+DECLOP_2VAR_PREOP(float2, ++)
+DECLOP_2VAR_PREOP(float2, --)
+
+DECLOP_2VAR_POSTOP(float2, ++)
+DECLOP_2VAR_POSTOP(float2, --)
+
+DECLOP_2VAR_COMP(float2, ==)
+DECLOP_2VAR_COMP(float2, !=)
+DECLOP_2VAR_COMP(float2, <)
+DECLOP_2VAR_COMP(float2, >)
+DECLOP_2VAR_COMP(float2, <=)
+DECLOP_2VAR_COMP(float2, >=)
+
+DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(float2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(float2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(float2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(float2, float)
+DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(float2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(float2, double)
+DECLOP_2VAR_SCALE_PRODUCT(float2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(float2, signed long long)
+
+// FLOAT3: DECLOP_3VAR_* instantiations for float3 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_3VAR_2IN_1OUT(float3, +)
+DECLOP_3VAR_2IN_1OUT(float3, -)
+DECLOP_3VAR_2IN_1OUT(float3, *)
+DECLOP_3VAR_2IN_1OUT(float3, /)
+
+DECLOP_3VAR_ASSIGN(float3, +=)
+DECLOP_3VAR_ASSIGN(float3, -=)
+DECLOP_3VAR_ASSIGN(float3, *=)
+DECLOP_3VAR_ASSIGN(float3, /=)
+
+DECLOP_3VAR_PREOP(float3, ++)
+DECLOP_3VAR_PREOP(float3, --)
+
+DECLOP_3VAR_POSTOP(float3, ++)
+DECLOP_3VAR_POSTOP(float3, --)
+
+DECLOP_3VAR_COMP(float3, ==)
+DECLOP_3VAR_COMP(float3, !=)
+DECLOP_3VAR_COMP(float3, <)
+DECLOP_3VAR_COMP(float3, >)
+DECLOP_3VAR_COMP(float3, <=)
+DECLOP_3VAR_COMP(float3, >=)
+
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(float3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(float3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(float3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(float3, float)
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(float3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(float3, double)
+DECLOP_3VAR_SCALE_PRODUCT(float3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(float3, signed long long)
+
+// FLOAT4: DECLOP_4VAR_* instantiations for float4 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(float4, +)
+DECLOP_4VAR_2IN_1OUT(float4, -)
+DECLOP_4VAR_2IN_1OUT(float4, *)
+DECLOP_4VAR_2IN_1OUT(float4, /)
+
+DECLOP_4VAR_ASSIGN(float4, +=)
+DECLOP_4VAR_ASSIGN(float4, -=)
+DECLOP_4VAR_ASSIGN(float4, *=)
+DECLOP_4VAR_ASSIGN(float4, /=)
+
+DECLOP_4VAR_PREOP(float4, ++)
+DECLOP_4VAR_PREOP(float4, --)
+
+DECLOP_4VAR_POSTOP(float4, ++)
+DECLOP_4VAR_POSTOP(float4, --)
+
+DECLOP_4VAR_COMP(float4, ==)
+DECLOP_4VAR_COMP(float4, !=)
+DECLOP_4VAR_COMP(float4, <)
+DECLOP_4VAR_COMP(float4, >)
+DECLOP_4VAR_COMP(float4, <=)
+DECLOP_4VAR_COMP(float4, >=)
+
+DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(float4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(float4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(float4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(float4, float)
+DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(float4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(float4, double)
+DECLOP_4VAR_SCALE_PRODUCT(float4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(float4, signed long long)
+
+// DOUBLE1: DECLOP_1VAR_* instantiations for double1 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_1VAR_2IN_1OUT(double1, +)
+DECLOP_1VAR_2IN_1OUT(double1, -)
+DECLOP_1VAR_2IN_1OUT(double1, *)
+DECLOP_1VAR_2IN_1OUT(double1, /)
+
+DECLOP_1VAR_ASSIGN(double1, +=)
+DECLOP_1VAR_ASSIGN(double1, -=)
+DECLOP_1VAR_ASSIGN(double1, *=)
+DECLOP_1VAR_ASSIGN(double1, /=)
+
+DECLOP_1VAR_PREOP(double1, ++)
+DECLOP_1VAR_PREOP(double1, --)
+
+DECLOP_1VAR_POSTOP(double1, ++)
+DECLOP_1VAR_POSTOP(double1, --)
+
+DECLOP_1VAR_COMP(double1, ==)
+DECLOP_1VAR_COMP(double1, !=)
+DECLOP_1VAR_COMP(double1, <)
+DECLOP_1VAR_COMP(double1, >)
+DECLOP_1VAR_COMP(double1, <=)
+DECLOP_1VAR_COMP(double1, >=)
+
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(double1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(double1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(double1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(double1, float)
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(double1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(double1, double)
+DECLOP_1VAR_SCALE_PRODUCT(double1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(double1, signed long long)
+
+// DOUBLE2: DECLOP_2VAR_* instantiations for double2 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_2VAR_2IN_1OUT(double2, +)
+DECLOP_2VAR_2IN_1OUT(double2, -)
+DECLOP_2VAR_2IN_1OUT(double2, *)
+DECLOP_2VAR_2IN_1OUT(double2, /)
+
+DECLOP_2VAR_ASSIGN(double2, +=)
+DECLOP_2VAR_ASSIGN(double2, -=)
+DECLOP_2VAR_ASSIGN(double2, *=)
+DECLOP_2VAR_ASSIGN(double2, /=)
+
+DECLOP_2VAR_PREOP(double2, ++)
+DECLOP_2VAR_PREOP(double2, --)
+
+DECLOP_2VAR_POSTOP(double2, ++)
+DECLOP_2VAR_POSTOP(double2, --)
+
+DECLOP_2VAR_COMP(double2, ==)
+DECLOP_2VAR_COMP(double2, !=)
+DECLOP_2VAR_COMP(double2, <)
+DECLOP_2VAR_COMP(double2, >)
+DECLOP_2VAR_COMP(double2, <=)
+DECLOP_2VAR_COMP(double2, >=)
+
+DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(double2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(double2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(double2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(double2, float)
+DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(double2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(double2, double)
+DECLOP_2VAR_SCALE_PRODUCT(double2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(double2, signed long long)
+
+// DOUBLE3: DECLOP_3VAR_* instantiations for double3 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_3VAR_2IN_1OUT(double3, +)
+DECLOP_3VAR_2IN_1OUT(double3, -)
+DECLOP_3VAR_2IN_1OUT(double3, *)
+DECLOP_3VAR_2IN_1OUT(double3, /)
+
+DECLOP_3VAR_ASSIGN(double3, +=)
+DECLOP_3VAR_ASSIGN(double3, -=)
+DECLOP_3VAR_ASSIGN(double3, *=)
+DECLOP_3VAR_ASSIGN(double3, /=)
+
+DECLOP_3VAR_PREOP(double3, ++)
+DECLOP_3VAR_PREOP(double3, --)
+
+DECLOP_3VAR_POSTOP(double3, ++)
+DECLOP_3VAR_POSTOP(double3, --)
+
+DECLOP_3VAR_COMP(double3, ==)
+DECLOP_3VAR_COMP(double3, !=)
+DECLOP_3VAR_COMP(double3, <)
+DECLOP_3VAR_COMP(double3, >)
+DECLOP_3VAR_COMP(double3, <=)
+DECLOP_3VAR_COMP(double3, >=)
+
+DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(double3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(double3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(double3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(double3, float)
+DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(double3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(double3, double)
+DECLOP_3VAR_SCALE_PRODUCT(double3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(double3, signed long long)
+
+// DOUBLE4: DECLOP_4VAR_* instantiations for double4 (+ - * /, their compound-assign forms, pre/post ++/--, comparison, per-scalar SCALE_PRODUCT); no %, bitwise, shift or logical forms are instantiated here. Macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(double4, +)
+DECLOP_4VAR_2IN_1OUT(double4, -)
+DECLOP_4VAR_2IN_1OUT(double4, *)
+DECLOP_4VAR_2IN_1OUT(double4, /)
+
+DECLOP_4VAR_ASSIGN(double4, +=)
+DECLOP_4VAR_ASSIGN(double4, -=)
+DECLOP_4VAR_ASSIGN(double4, *=)
+DECLOP_4VAR_ASSIGN(double4, /=)
+
+DECLOP_4VAR_PREOP(double4, ++)
+DECLOP_4VAR_PREOP(double4, --)
+
+DECLOP_4VAR_POSTOP(double4, ++)
+DECLOP_4VAR_POSTOP(double4, --)
+
+DECLOP_4VAR_COMP(double4, ==)
+DECLOP_4VAR_COMP(double4, !=)
+DECLOP_4VAR_COMP(double4, <)
+DECLOP_4VAR_COMP(double4, >)
+DECLOP_4VAR_COMP(double4, <=)
+DECLOP_4VAR_COMP(double4, >=)
+
+DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(double4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(double4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(double4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(double4, float)
+DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(double4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(double4, double)
+DECLOP_4VAR_SCALE_PRODUCT(double4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(double4, signed long long)
+
+// UNSIGNED LONG1: DECLOP_1VAR_* instantiations for ulong1 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_1VAR_2IN_1OUT(ulong1, +)
+DECLOP_1VAR_2IN_1OUT(ulong1, -)
+DECLOP_1VAR_2IN_1OUT(ulong1, *)
+DECLOP_1VAR_2IN_1OUT(ulong1, /)
+DECLOP_1VAR_2IN_1OUT(ulong1, %)
+DECLOP_1VAR_2IN_1OUT(ulong1, &)
+DECLOP_1VAR_2IN_1OUT(ulong1, |)
+DECLOP_1VAR_2IN_1OUT(ulong1, ^)
+DECLOP_1VAR_2IN_1OUT(ulong1, <<)
+DECLOP_1VAR_2IN_1OUT(ulong1, >>)
+
+
+DECLOP_1VAR_ASSIGN(ulong1, +=)
+DECLOP_1VAR_ASSIGN(ulong1, -=)
+DECLOP_1VAR_ASSIGN(ulong1, *=)
+DECLOP_1VAR_ASSIGN(ulong1, /=)
+DECLOP_1VAR_ASSIGN(ulong1, %=)
+DECLOP_1VAR_ASSIGN(ulong1, &=)
+DECLOP_1VAR_ASSIGN(ulong1, |=)
+DECLOP_1VAR_ASSIGN(ulong1, ^=)
+DECLOP_1VAR_ASSIGN(ulong1, <<=)
+DECLOP_1VAR_ASSIGN(ulong1, >>=)
+
+DECLOP_1VAR_PREOP(ulong1, ++)
+DECLOP_1VAR_PREOP(ulong1, --)
+
+DECLOP_1VAR_POSTOP(ulong1, ++)
+DECLOP_1VAR_POSTOP(ulong1, --)
+
+DECLOP_1VAR_COMP(ulong1, ==)
+DECLOP_1VAR_COMP(ulong1, !=)
+DECLOP_1VAR_COMP(ulong1, <)
+DECLOP_1VAR_COMP(ulong1, >)
+DECLOP_1VAR_COMP(ulong1, <=)
+DECLOP_1VAR_COMP(ulong1, >=)
+
+DECLOP_1VAR_COMP(ulong1, &&)
+DECLOP_1VAR_COMP(ulong1, ||)
+
+DECLOP_1VAR_1IN_1OUT(ulong1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(ulong1, !)
+
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, float)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, double)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(ulong1, signed long long)
+
+// UNSIGNED LONG2: DECLOP_2VAR_* instantiations for ulong2 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_2VAR_2IN_1OUT(ulong2, +)
+DECLOP_2VAR_2IN_1OUT(ulong2, -)
+DECLOP_2VAR_2IN_1OUT(ulong2, *)
+DECLOP_2VAR_2IN_1OUT(ulong2, /)
+DECLOP_2VAR_2IN_1OUT(ulong2, %)
+DECLOP_2VAR_2IN_1OUT(ulong2, &)
+DECLOP_2VAR_2IN_1OUT(ulong2, |)
+DECLOP_2VAR_2IN_1OUT(ulong2, ^)
+DECLOP_2VAR_2IN_1OUT(ulong2, <<)
+DECLOP_2VAR_2IN_1OUT(ulong2, >>)
+
+DECLOP_2VAR_ASSIGN(ulong2, +=)
+DECLOP_2VAR_ASSIGN(ulong2, -=)
+DECLOP_2VAR_ASSIGN(ulong2, *=)
+DECLOP_2VAR_ASSIGN(ulong2, /=)
+DECLOP_2VAR_ASSIGN(ulong2, %=)
+DECLOP_2VAR_ASSIGN(ulong2, &=)
+DECLOP_2VAR_ASSIGN(ulong2, |=)
+DECLOP_2VAR_ASSIGN(ulong2, ^=)
+DECLOP_2VAR_ASSIGN(ulong2, <<=)
+DECLOP_2VAR_ASSIGN(ulong2, >>=)
+
+DECLOP_2VAR_PREOP(ulong2, ++)
+DECLOP_2VAR_PREOP(ulong2, --)
+
+DECLOP_2VAR_POSTOP(ulong2, ++)
+DECLOP_2VAR_POSTOP(ulong2, --)
+
+DECLOP_2VAR_COMP(ulong2, ==)
+DECLOP_2VAR_COMP(ulong2, !=)
+DECLOP_2VAR_COMP(ulong2, <)
+DECLOP_2VAR_COMP(ulong2, >)
+DECLOP_2VAR_COMP(ulong2, <=)
+DECLOP_2VAR_COMP(ulong2, >=)
+
+DECLOP_2VAR_COMP(ulong2, &&)
+DECLOP_2VAR_COMP(ulong2, ||)
+
+DECLOP_2VAR_1IN_1OUT(ulong2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(ulong2, !)
+
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, float)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, double)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(ulong2, signed long long)
+
+// UNSIGNED LONG3: DECLOP_3VAR_* instantiations for ulong3 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_3VAR_2IN_1OUT(ulong3, +)
+DECLOP_3VAR_2IN_1OUT(ulong3, -)
+DECLOP_3VAR_2IN_1OUT(ulong3, *)
+DECLOP_3VAR_2IN_1OUT(ulong3, /)
+DECLOP_3VAR_2IN_1OUT(ulong3, %)
+DECLOP_3VAR_2IN_1OUT(ulong3, &)
+DECLOP_3VAR_2IN_1OUT(ulong3, |)
+DECLOP_3VAR_2IN_1OUT(ulong3, ^)
+DECLOP_3VAR_2IN_1OUT(ulong3, <<)
+DECLOP_3VAR_2IN_1OUT(ulong3, >>)
+
+DECLOP_3VAR_ASSIGN(ulong3, +=)
+DECLOP_3VAR_ASSIGN(ulong3, -=)
+DECLOP_3VAR_ASSIGN(ulong3, *=)
+DECLOP_3VAR_ASSIGN(ulong3, /=)
+DECLOP_3VAR_ASSIGN(ulong3, %=)
+DECLOP_3VAR_ASSIGN(ulong3, &=)
+DECLOP_3VAR_ASSIGN(ulong3, |=)
+DECLOP_3VAR_ASSIGN(ulong3, ^=)
+DECLOP_3VAR_ASSIGN(ulong3, <<=)
+DECLOP_3VAR_ASSIGN(ulong3, >>=)
+
+DECLOP_3VAR_PREOP(ulong3, ++)
+DECLOP_3VAR_PREOP(ulong3, --)
+
+DECLOP_3VAR_POSTOP(ulong3, ++)
+DECLOP_3VAR_POSTOP(ulong3, --)
+
+DECLOP_3VAR_COMP(ulong3, ==)
+DECLOP_3VAR_COMP(ulong3, !=)
+DECLOP_3VAR_COMP(ulong3, <)
+DECLOP_3VAR_COMP(ulong3, >)
+DECLOP_3VAR_COMP(ulong3, <=)
+DECLOP_3VAR_COMP(ulong3, >=)
+
+DECLOP_3VAR_COMP(ulong3, &&)
+DECLOP_3VAR_COMP(ulong3, ||)
+
+DECLOP_3VAR_1IN_1OUT(ulong3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(ulong3, !)
+
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, float)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, double)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(ulong3, signed long long)
+
+// UNSIGNED LONG4: DECLOP_4VAR_* instantiations for ulong4 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(ulong4, +)
+DECLOP_4VAR_2IN_1OUT(ulong4, -)
+DECLOP_4VAR_2IN_1OUT(ulong4, *)
+DECLOP_4VAR_2IN_1OUT(ulong4, /)
+DECLOP_4VAR_2IN_1OUT(ulong4, %)
+DECLOP_4VAR_2IN_1OUT(ulong4, &)
+DECLOP_4VAR_2IN_1OUT(ulong4, |)
+DECLOP_4VAR_2IN_1OUT(ulong4, ^)
+DECLOP_4VAR_2IN_1OUT(ulong4, <<)
+DECLOP_4VAR_2IN_1OUT(ulong4, >>)
+
+DECLOP_4VAR_ASSIGN(ulong4, +=)
+DECLOP_4VAR_ASSIGN(ulong4, -=)
+DECLOP_4VAR_ASSIGN(ulong4, *=)
+DECLOP_4VAR_ASSIGN(ulong4, /=)
+DECLOP_4VAR_ASSIGN(ulong4, %=)
+DECLOP_4VAR_ASSIGN(ulong4, &=)
+DECLOP_4VAR_ASSIGN(ulong4, |=)
+DECLOP_4VAR_ASSIGN(ulong4, ^=)
+DECLOP_4VAR_ASSIGN(ulong4, <<=)
+DECLOP_4VAR_ASSIGN(ulong4, >>=)
+
+DECLOP_4VAR_PREOP(ulong4, ++)
+DECLOP_4VAR_PREOP(ulong4, --)
+
+DECLOP_4VAR_POSTOP(ulong4, ++)
+DECLOP_4VAR_POSTOP(ulong4, --)
+
+DECLOP_4VAR_COMP(ulong4, ==)
+DECLOP_4VAR_COMP(ulong4, !=)
+DECLOP_4VAR_COMP(ulong4, <)
+DECLOP_4VAR_COMP(ulong4, >)
+DECLOP_4VAR_COMP(ulong4, <=)
+DECLOP_4VAR_COMP(ulong4, >=)
+
+DECLOP_4VAR_COMP(ulong4, &&)
+DECLOP_4VAR_COMP(ulong4, ||)
+
+DECLOP_4VAR_1IN_1OUT(ulong4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(ulong4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, float)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, double)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(ulong4, signed long long)
+
+// SIGNED LONG1: DECLOP_1VAR_* instantiations for long1 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_1VAR_2IN_1OUT(long1, +)
+DECLOP_1VAR_2IN_1OUT(long1, -)
+DECLOP_1VAR_2IN_1OUT(long1, *)
+DECLOP_1VAR_2IN_1OUT(long1, /)
+DECLOP_1VAR_2IN_1OUT(long1, %)
+DECLOP_1VAR_2IN_1OUT(long1, &)
+DECLOP_1VAR_2IN_1OUT(long1, |)
+DECLOP_1VAR_2IN_1OUT(long1, ^)
+DECLOP_1VAR_2IN_1OUT(long1, <<)
+DECLOP_1VAR_2IN_1OUT(long1, >>)
+
+
+DECLOP_1VAR_ASSIGN(long1, +=)
+DECLOP_1VAR_ASSIGN(long1, -=)
+DECLOP_1VAR_ASSIGN(long1, *=)
+DECLOP_1VAR_ASSIGN(long1, /=)
+DECLOP_1VAR_ASSIGN(long1, %=)
+DECLOP_1VAR_ASSIGN(long1, &=)
+DECLOP_1VAR_ASSIGN(long1, |=)
+DECLOP_1VAR_ASSIGN(long1, ^=)
+DECLOP_1VAR_ASSIGN(long1, <<=)
+DECLOP_1VAR_ASSIGN(long1, >>=)
+
+DECLOP_1VAR_PREOP(long1, ++)
+DECLOP_1VAR_PREOP(long1, --)
+
+DECLOP_1VAR_POSTOP(long1, ++)
+DECLOP_1VAR_POSTOP(long1, --)
+
+DECLOP_1VAR_COMP(long1, ==)
+DECLOP_1VAR_COMP(long1, !=)
+DECLOP_1VAR_COMP(long1, <)
+DECLOP_1VAR_COMP(long1, >)
+DECLOP_1VAR_COMP(long1, <=)
+DECLOP_1VAR_COMP(long1, >=)
+
+DECLOP_1VAR_COMP(long1, &&)
+DECLOP_1VAR_COMP(long1, ||)
+
+DECLOP_1VAR_1IN_1OUT(long1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(long1, !)
+
+DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(long1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(long1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(long1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(long1, float)
+DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(long1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(long1, double)
+DECLOP_1VAR_SCALE_PRODUCT(long1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(long1, signed long long)
+
+// SIGNED LONG2: DECLOP_2VAR_* instantiations for long2 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_2VAR_2IN_1OUT(long2, +)
+DECLOP_2VAR_2IN_1OUT(long2, -)
+DECLOP_2VAR_2IN_1OUT(long2, *)
+DECLOP_2VAR_2IN_1OUT(long2, /)
+DECLOP_2VAR_2IN_1OUT(long2, %)
+DECLOP_2VAR_2IN_1OUT(long2, &)
+DECLOP_2VAR_2IN_1OUT(long2, |)
+DECLOP_2VAR_2IN_1OUT(long2, ^)
+DECLOP_2VAR_2IN_1OUT(long2, <<)
+DECLOP_2VAR_2IN_1OUT(long2, >>)
+
+DECLOP_2VAR_ASSIGN(long2, +=)
+DECLOP_2VAR_ASSIGN(long2, -=)
+DECLOP_2VAR_ASSIGN(long2, *=)
+DECLOP_2VAR_ASSIGN(long2, /=)
+DECLOP_2VAR_ASSIGN(long2, %=)
+DECLOP_2VAR_ASSIGN(long2, &=)
+DECLOP_2VAR_ASSIGN(long2, |=)
+DECLOP_2VAR_ASSIGN(long2, ^=)
+DECLOP_2VAR_ASSIGN(long2, <<=)
+DECLOP_2VAR_ASSIGN(long2, >>=)
+
+DECLOP_2VAR_PREOP(long2, ++)
+DECLOP_2VAR_PREOP(long2, --)
+
+DECLOP_2VAR_POSTOP(long2, ++)
+DECLOP_2VAR_POSTOP(long2, --)
+
+DECLOP_2VAR_COMP(long2, ==)
+DECLOP_2VAR_COMP(long2, !=)
+DECLOP_2VAR_COMP(long2, <)
+DECLOP_2VAR_COMP(long2, >)
+DECLOP_2VAR_COMP(long2, <=)
+DECLOP_2VAR_COMP(long2, >=)
+
+DECLOP_2VAR_COMP(long2, &&)
+DECLOP_2VAR_COMP(long2, ||)
+
+DECLOP_2VAR_1IN_1OUT(long2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(long2, !)
+
+DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(long2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(long2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(long2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(long2, float)
+DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(long2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(long2, double)
+DECLOP_2VAR_SCALE_PRODUCT(long2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(long2, signed long long)
+
+// SIGNED LONG3: DECLOP_3VAR_* instantiations for long3 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_3VAR_2IN_1OUT(long3, +)
+DECLOP_3VAR_2IN_1OUT(long3, -)
+DECLOP_3VAR_2IN_1OUT(long3, *)
+DECLOP_3VAR_2IN_1OUT(long3, /)
+DECLOP_3VAR_2IN_1OUT(long3, %)
+DECLOP_3VAR_2IN_1OUT(long3, &)
+DECLOP_3VAR_2IN_1OUT(long3, |)
+DECLOP_3VAR_2IN_1OUT(long3, ^)
+DECLOP_3VAR_2IN_1OUT(long3, <<)
+DECLOP_3VAR_2IN_1OUT(long3, >>)
+
+DECLOP_3VAR_ASSIGN(long3, +=)
+DECLOP_3VAR_ASSIGN(long3, -=)
+DECLOP_3VAR_ASSIGN(long3, *=)
+DECLOP_3VAR_ASSIGN(long3, /=)
+DECLOP_3VAR_ASSIGN(long3, %=)
+DECLOP_3VAR_ASSIGN(long3, &=)
+DECLOP_3VAR_ASSIGN(long3, |=)
+DECLOP_3VAR_ASSIGN(long3, ^=)
+DECLOP_3VAR_ASSIGN(long3, <<=)
+DECLOP_3VAR_ASSIGN(long3, >>=)
+
+DECLOP_3VAR_PREOP(long3, ++)
+DECLOP_3VAR_PREOP(long3, --)
+
+DECLOP_3VAR_POSTOP(long3, ++)
+DECLOP_3VAR_POSTOP(long3, --)
+
+DECLOP_3VAR_COMP(long3, ==)
+DECLOP_3VAR_COMP(long3, !=)
+DECLOP_3VAR_COMP(long3, <)
+DECLOP_3VAR_COMP(long3, >)
+DECLOP_3VAR_COMP(long3, <=)
+DECLOP_3VAR_COMP(long3, >=)
+
+DECLOP_3VAR_COMP(long3, &&)
+DECLOP_3VAR_COMP(long3, ||)
+
+DECLOP_3VAR_1IN_1OUT(long3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(long3, !)
+
+DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(long3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(long3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(long3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(long3, float)
+DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(long3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(long3, double)
+DECLOP_3VAR_SCALE_PRODUCT(long3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(long3, signed long long)
+
+// SIGNED LONG4: DECLOP_4VAR_* instantiations for long4 (binary, compound-assign, pre/post ++/--, comparison, &&/||, ~, !, per-scalar SCALE_PRODUCT); macro bodies are defined outside this section.
+
+DECLOP_4VAR_2IN_1OUT(long4, +)
+DECLOP_4VAR_2IN_1OUT(long4, -)
+DECLOP_4VAR_2IN_1OUT(long4, *)
+DECLOP_4VAR_2IN_1OUT(long4, /)
+DECLOP_4VAR_2IN_1OUT(long4, %)
+DECLOP_4VAR_2IN_1OUT(long4, &)
+DECLOP_4VAR_2IN_1OUT(long4, |)
+DECLOP_4VAR_2IN_1OUT(long4, ^)
+DECLOP_4VAR_2IN_1OUT(long4, <<)
+DECLOP_4VAR_2IN_1OUT(long4, >>)
+
+DECLOP_4VAR_ASSIGN(long4, +=)
+DECLOP_4VAR_ASSIGN(long4, -=)
+DECLOP_4VAR_ASSIGN(long4, *=)
+DECLOP_4VAR_ASSIGN(long4, /=)
+DECLOP_4VAR_ASSIGN(long4, %=)
+DECLOP_4VAR_ASSIGN(long4, &=)
+DECLOP_4VAR_ASSIGN(long4, |=)
+DECLOP_4VAR_ASSIGN(long4, ^=)
+DECLOP_4VAR_ASSIGN(long4, <<=)
+DECLOP_4VAR_ASSIGN(long4, >>=)
+
+DECLOP_4VAR_PREOP(long4, ++)
+DECLOP_4VAR_PREOP(long4, --)
+
+DECLOP_4VAR_POSTOP(long4, ++)
+DECLOP_4VAR_POSTOP(long4, --)
+
+DECLOP_4VAR_COMP(long4, ==)
+DECLOP_4VAR_COMP(long4, !=)
+DECLOP_4VAR_COMP(long4, <)
+DECLOP_4VAR_COMP(long4, >)
+DECLOP_4VAR_COMP(long4, <=)
+DECLOP_4VAR_COMP(long4, >=)
+
+DECLOP_4VAR_COMP(long4, &&)
+DECLOP_4VAR_COMP(long4, ||)
+
+DECLOP_4VAR_1IN_1OUT(long4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(long4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(long4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(long4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(long4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(long4, float)
+DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(long4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(long4, double)
+DECLOP_4VAR_SCALE_PRODUCT(long4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(long4, signed long long)
+
+// UNSIGNED LONGLONG1
+
+DECLOP_1VAR_2IN_1OUT(ulonglong1, +)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, -)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, *)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, /)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, %)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, &)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, |)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, ^)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, <<)
+DECLOP_1VAR_2IN_1OUT(ulonglong1, >>)
+
+
+DECLOP_1VAR_ASSIGN(ulonglong1, +=)
+DECLOP_1VAR_ASSIGN(ulonglong1, -=)
+DECLOP_1VAR_ASSIGN(ulonglong1, *=)
+DECLOP_1VAR_ASSIGN(ulonglong1, /=)
+DECLOP_1VAR_ASSIGN(ulonglong1, %=)
+DECLOP_1VAR_ASSIGN(ulonglong1, &=)
+DECLOP_1VAR_ASSIGN(ulonglong1, |=)
+DECLOP_1VAR_ASSIGN(ulonglong1, ^=)
+DECLOP_1VAR_ASSIGN(ulonglong1, <<=)
+DECLOP_1VAR_ASSIGN(ulonglong1, >>=)
+
+DECLOP_1VAR_PREOP(ulonglong1, ++)
+DECLOP_1VAR_PREOP(ulonglong1, --)
+
+DECLOP_1VAR_POSTOP(ulonglong1, ++)
+DECLOP_1VAR_POSTOP(ulonglong1, --)
+
+DECLOP_1VAR_COMP(ulonglong1, ==)
+DECLOP_1VAR_COMP(ulonglong1, !=)
+DECLOP_1VAR_COMP(ulonglong1, <)
+DECLOP_1VAR_COMP(ulonglong1, >)
+DECLOP_1VAR_COMP(ulonglong1, <=)
+DECLOP_1VAR_COMP(ulonglong1, >=)
+
+DECLOP_1VAR_COMP(ulonglong1, &&)
+DECLOP_1VAR_COMP(ulonglong1, ||)
+
+DECLOP_1VAR_1IN_1OUT(ulonglong1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(ulonglong1, !)
+
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, float)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, double)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(ulonglong1, signed long long)
+
+// UNSIGNED LONGLONG2
+
+DECLOP_2VAR_2IN_1OUT(ulonglong2, +)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, -)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, *)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, /)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, %)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, &)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, |)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, ^)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, <<)
+DECLOP_2VAR_2IN_1OUT(ulonglong2, >>)
+
+DECLOP_2VAR_ASSIGN(ulonglong2, +=)
+DECLOP_2VAR_ASSIGN(ulonglong2, -=)
+DECLOP_2VAR_ASSIGN(ulonglong2, *=)
+DECLOP_2VAR_ASSIGN(ulonglong2, /=)
+DECLOP_2VAR_ASSIGN(ulonglong2, %=)
+DECLOP_2VAR_ASSIGN(ulonglong2, &=)
+DECLOP_2VAR_ASSIGN(ulonglong2, |=)
+DECLOP_2VAR_ASSIGN(ulonglong2, ^=)
+DECLOP_2VAR_ASSIGN(ulonglong2, <<=)
+DECLOP_2VAR_ASSIGN(ulonglong2, >>=)
+
+DECLOP_2VAR_PREOP(ulonglong2, ++)
+DECLOP_2VAR_PREOP(ulonglong2, --)
+
+DECLOP_2VAR_POSTOP(ulonglong2, ++)
+DECLOP_2VAR_POSTOP(ulonglong2, --)
+
+DECLOP_2VAR_COMP(ulonglong2, ==)
+DECLOP_2VAR_COMP(ulonglong2, !=)
+DECLOP_2VAR_COMP(ulonglong2, <)
+DECLOP_2VAR_COMP(ulonglong2, >)
+DECLOP_2VAR_COMP(ulonglong2, <=)
+DECLOP_2VAR_COMP(ulonglong2, >=)
+
+DECLOP_2VAR_COMP(ulonglong2, &&)
+DECLOP_2VAR_COMP(ulonglong2, ||)
+
+DECLOP_2VAR_1IN_1OUT(ulonglong2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(ulonglong2, !)
+
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, float)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, double)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(ulonglong2, signed long long)
+
+// UNSIGNED LONGLONG3
+
+DECLOP_3VAR_2IN_1OUT(ulonglong3, +)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, -)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, *)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, /)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, %)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, &)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, |)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, ^)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, <<)
+DECLOP_3VAR_2IN_1OUT(ulonglong3, >>)
+
+DECLOP_3VAR_ASSIGN(ulonglong3, +=)
+DECLOP_3VAR_ASSIGN(ulonglong3, -=)
+DECLOP_3VAR_ASSIGN(ulonglong3, *=)
+DECLOP_3VAR_ASSIGN(ulonglong3, /=)
+DECLOP_3VAR_ASSIGN(ulonglong3, %=)
+DECLOP_3VAR_ASSIGN(ulonglong3, &=)
+DECLOP_3VAR_ASSIGN(ulonglong3, |=)
+DECLOP_3VAR_ASSIGN(ulonglong3, ^=)
+DECLOP_3VAR_ASSIGN(ulonglong3, <<=)
+DECLOP_3VAR_ASSIGN(ulonglong3, >>=)
+
+DECLOP_3VAR_PREOP(ulonglong3, ++)
+DECLOP_3VAR_PREOP(ulonglong3, --)
+
+DECLOP_3VAR_POSTOP(ulonglong3, ++)
+DECLOP_3VAR_POSTOP(ulonglong3, --)
+
+DECLOP_3VAR_COMP(ulonglong3, ==)
+DECLOP_3VAR_COMP(ulonglong3, !=)
+DECLOP_3VAR_COMP(ulonglong3, <)
+DECLOP_3VAR_COMP(ulonglong3, >)
+DECLOP_3VAR_COMP(ulonglong3, <=)
+DECLOP_3VAR_COMP(ulonglong3, >=)
+
+DECLOP_3VAR_COMP(ulonglong3, &&)
+DECLOP_3VAR_COMP(ulonglong3, ||)
+
+DECLOP_3VAR_1IN_1OUT(ulonglong3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(ulonglong3, !)
+
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, float)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, double)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(ulonglong3, signed long long)
+
+// UNSIGNED LONGLONG4
+
+DECLOP_4VAR_2IN_1OUT(ulonglong4, +)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, -)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, *)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, /)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, %)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, &)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, |)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, ^)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, <<)
+DECLOP_4VAR_2IN_1OUT(ulonglong4, >>)
+
+DECLOP_4VAR_ASSIGN(ulonglong4, +=)
+DECLOP_4VAR_ASSIGN(ulonglong4, -=)
+DECLOP_4VAR_ASSIGN(ulonglong4, *=)
+DECLOP_4VAR_ASSIGN(ulonglong4, /=)
+DECLOP_4VAR_ASSIGN(ulonglong4, %=)
+DECLOP_4VAR_ASSIGN(ulonglong4, &=)
+DECLOP_4VAR_ASSIGN(ulonglong4, |=)
+DECLOP_4VAR_ASSIGN(ulonglong4, ^=)
+DECLOP_4VAR_ASSIGN(ulonglong4, <<=)
+DECLOP_4VAR_ASSIGN(ulonglong4, >>=)
+
+DECLOP_4VAR_PREOP(ulonglong4, ++)
+DECLOP_4VAR_PREOP(ulonglong4, --)
+
+DECLOP_4VAR_POSTOP(ulonglong4, ++)
+DECLOP_4VAR_POSTOP(ulonglong4, --)
+
+DECLOP_4VAR_COMP(ulonglong4, ==)
+DECLOP_4VAR_COMP(ulonglong4, !=)
+DECLOP_4VAR_COMP(ulonglong4, <)
+DECLOP_4VAR_COMP(ulonglong4, >)
+DECLOP_4VAR_COMP(ulonglong4, <=)
+DECLOP_4VAR_COMP(ulonglong4, >=)
+
+DECLOP_4VAR_COMP(ulonglong4, &&)
+DECLOP_4VAR_COMP(ulonglong4, ||)
+
+DECLOP_4VAR_1IN_1OUT(ulonglong4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(ulonglong4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, float)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, double)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(ulonglong4, signed long long)
+
+// SIGNED LONGLONG1
+
+DECLOP_1VAR_2IN_1OUT(longlong1, +)
+DECLOP_1VAR_2IN_1OUT(longlong1, -)
+DECLOP_1VAR_2IN_1OUT(longlong1, *)
+DECLOP_1VAR_2IN_1OUT(longlong1, /)
+DECLOP_1VAR_2IN_1OUT(longlong1, %)
+DECLOP_1VAR_2IN_1OUT(longlong1, &)
+DECLOP_1VAR_2IN_1OUT(longlong1, |)
+DECLOP_1VAR_2IN_1OUT(longlong1, ^)
+DECLOP_1VAR_2IN_1OUT(longlong1, <<)
+DECLOP_1VAR_2IN_1OUT(longlong1, >>)
+
+
+DECLOP_1VAR_ASSIGN(longlong1, +=)
+DECLOP_1VAR_ASSIGN(longlong1, -=)
+DECLOP_1VAR_ASSIGN(longlong1, *=)
+DECLOP_1VAR_ASSIGN(longlong1, /=)
+DECLOP_1VAR_ASSIGN(longlong1, %=)
+DECLOP_1VAR_ASSIGN(longlong1, &=)
+DECLOP_1VAR_ASSIGN(longlong1, |=)
+DECLOP_1VAR_ASSIGN(longlong1, ^=)
+DECLOP_1VAR_ASSIGN(longlong1, <<=)
+DECLOP_1VAR_ASSIGN(longlong1, >>=)
+
+DECLOP_1VAR_PREOP(longlong1, ++)
+DECLOP_1VAR_PREOP(longlong1, --)
+
+DECLOP_1VAR_POSTOP(longlong1, ++)
+DECLOP_1VAR_POSTOP(longlong1, --)
+
+DECLOP_1VAR_COMP(longlong1, ==)
+DECLOP_1VAR_COMP(longlong1, !=)
+DECLOP_1VAR_COMP(longlong1, <)
+DECLOP_1VAR_COMP(longlong1, >)
+DECLOP_1VAR_COMP(longlong1, <=)
+DECLOP_1VAR_COMP(longlong1, >=)
+
+DECLOP_1VAR_COMP(longlong1, &&)
+DECLOP_1VAR_COMP(longlong1, ||)
+
+DECLOP_1VAR_1IN_1OUT(longlong1, ~)
+DECLOP_1VAR_1IN_BOOLOUT(longlong1, !)
+
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned char)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed char)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned short)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed short)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned int)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed int)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, float)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, double)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, unsigned long long)
+DECLOP_1VAR_SCALE_PRODUCT(longlong1, signed long long)
+
+// SIGNED LONGLONG2
+
+DECLOP_2VAR_2IN_1OUT(longlong2, +)
+DECLOP_2VAR_2IN_1OUT(longlong2, -)
+DECLOP_2VAR_2IN_1OUT(longlong2, *)
+DECLOP_2VAR_2IN_1OUT(longlong2, /)
+DECLOP_2VAR_2IN_1OUT(longlong2, %)
+DECLOP_2VAR_2IN_1OUT(longlong2, &)
+DECLOP_2VAR_2IN_1OUT(longlong2, |)
+DECLOP_2VAR_2IN_1OUT(longlong2, ^)
+DECLOP_2VAR_2IN_1OUT(longlong2, <<)
+DECLOP_2VAR_2IN_1OUT(longlong2, >>)
+
+DECLOP_2VAR_ASSIGN(longlong2, +=)
+DECLOP_2VAR_ASSIGN(longlong2, -=)
+DECLOP_2VAR_ASSIGN(longlong2, *=)
+DECLOP_2VAR_ASSIGN(longlong2, /=)
+DECLOP_2VAR_ASSIGN(longlong2, %=)
+DECLOP_2VAR_ASSIGN(longlong2, &=)
+DECLOP_2VAR_ASSIGN(longlong2, |=)
+DECLOP_2VAR_ASSIGN(longlong2, ^=)
+DECLOP_2VAR_ASSIGN(longlong2, <<=)
+DECLOP_2VAR_ASSIGN(longlong2, >>=)
+
+DECLOP_2VAR_PREOP(longlong2, ++)
+DECLOP_2VAR_PREOP(longlong2, --)
+
+DECLOP_2VAR_POSTOP(longlong2, ++)
+DECLOP_2VAR_POSTOP(longlong2, --)
+
+DECLOP_2VAR_COMP(longlong2, ==)
+DECLOP_2VAR_COMP(longlong2, !=)
+DECLOP_2VAR_COMP(longlong2, <)
+DECLOP_2VAR_COMP(longlong2, >)
+DECLOP_2VAR_COMP(longlong2, <=)
+DECLOP_2VAR_COMP(longlong2, >=)
+
+DECLOP_2VAR_COMP(longlong2, &&)
+DECLOP_2VAR_COMP(longlong2, ||)
+
+DECLOP_2VAR_1IN_1OUT(longlong2, ~)
+DECLOP_2VAR_1IN_BOOLOUT(longlong2, !)
+
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned char)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed char)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned short)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed short)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned int)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed int)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, float)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, double)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, unsigned long long)
+DECLOP_2VAR_SCALE_PRODUCT(longlong2, signed long long)
+
+// SIGNED LONGLONG3
+
+DECLOP_3VAR_2IN_1OUT(longlong3, +)
+DECLOP_3VAR_2IN_1OUT(longlong3, -)
+DECLOP_3VAR_2IN_1OUT(longlong3, *)
+DECLOP_3VAR_2IN_1OUT(longlong3, /)
+DECLOP_3VAR_2IN_1OUT(longlong3, %)
+DECLOP_3VAR_2IN_1OUT(longlong3, &)
+DECLOP_3VAR_2IN_1OUT(longlong3, |)
+DECLOP_3VAR_2IN_1OUT(longlong3, ^)
+DECLOP_3VAR_2IN_1OUT(longlong3, <<)
+DECLOP_3VAR_2IN_1OUT(longlong3, >>)
+
+DECLOP_3VAR_ASSIGN(longlong3, +=)
+DECLOP_3VAR_ASSIGN(longlong3, -=)
+DECLOP_3VAR_ASSIGN(longlong3, *=)
+DECLOP_3VAR_ASSIGN(longlong3, /=)
+DECLOP_3VAR_ASSIGN(longlong3, %=)
+DECLOP_3VAR_ASSIGN(longlong3, &=)
+DECLOP_3VAR_ASSIGN(longlong3, |=)
+DECLOP_3VAR_ASSIGN(longlong3, ^=)
+DECLOP_3VAR_ASSIGN(longlong3, <<=)
+DECLOP_3VAR_ASSIGN(longlong3, >>=)
+
+DECLOP_3VAR_PREOP(longlong3, ++)
+DECLOP_3VAR_PREOP(longlong3, --)
+
+DECLOP_3VAR_POSTOP(longlong3, ++)
+DECLOP_3VAR_POSTOP(longlong3, --)
+
+DECLOP_3VAR_COMP(longlong3, ==)
+DECLOP_3VAR_COMP(longlong3, !=)
+DECLOP_3VAR_COMP(longlong3, <)
+DECLOP_3VAR_COMP(longlong3, >)
+DECLOP_3VAR_COMP(longlong3, <=)
+DECLOP_3VAR_COMP(longlong3, >=)
+
+DECLOP_3VAR_COMP(longlong3, &&)
+DECLOP_3VAR_COMP(longlong3, ||)
+
+DECLOP_3VAR_1IN_1OUT(longlong3, ~)
+DECLOP_3VAR_1IN_BOOLOUT(longlong3, !)
+
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned char)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed char)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned short)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed short)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned int)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed int)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, float)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, double)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, unsigned long long)
+DECLOP_3VAR_SCALE_PRODUCT(longlong3, signed long long)
+
+// SIGNED LONGLONG4
+
+DECLOP_4VAR_2IN_1OUT(longlong4, +)
+DECLOP_4VAR_2IN_1OUT(longlong4, -)
+DECLOP_4VAR_2IN_1OUT(longlong4, *)
+DECLOP_4VAR_2IN_1OUT(longlong4, /)
+DECLOP_4VAR_2IN_1OUT(longlong4, %)
+DECLOP_4VAR_2IN_1OUT(longlong4, &)
+DECLOP_4VAR_2IN_1OUT(longlong4, |)
+DECLOP_4VAR_2IN_1OUT(longlong4, ^)
+DECLOP_4VAR_2IN_1OUT(longlong4, <<)
+DECLOP_4VAR_2IN_1OUT(longlong4, >>)
+
+DECLOP_4VAR_ASSIGN(longlong4, +=)
+DECLOP_4VAR_ASSIGN(longlong4, -=)
+DECLOP_4VAR_ASSIGN(longlong4, *=)
+DECLOP_4VAR_ASSIGN(longlong4, /=)
+DECLOP_4VAR_ASSIGN(longlong4, %=)
+DECLOP_4VAR_ASSIGN(longlong4, &=)
+DECLOP_4VAR_ASSIGN(longlong4, |=)
+DECLOP_4VAR_ASSIGN(longlong4, ^=)
+DECLOP_4VAR_ASSIGN(longlong4, <<=)
+DECLOP_4VAR_ASSIGN(longlong4, >>=)
+
+DECLOP_4VAR_PREOP(longlong4, ++)
+DECLOP_4VAR_PREOP(longlong4, --)
+
+DECLOP_4VAR_POSTOP(longlong4, ++)
+DECLOP_4VAR_POSTOP(longlong4, --)
+
+DECLOP_4VAR_COMP(longlong4, ==)
+DECLOP_4VAR_COMP(longlong4, !=)
+DECLOP_4VAR_COMP(longlong4, <)
+DECLOP_4VAR_COMP(longlong4, >)
+DECLOP_4VAR_COMP(longlong4, <=)
+DECLOP_4VAR_COMP(longlong4, >=)
+
+DECLOP_4VAR_COMP(longlong4, &&)
+DECLOP_4VAR_COMP(longlong4, ||)
+
+DECLOP_4VAR_1IN_1OUT(longlong4, ~)
+DECLOP_4VAR_1IN_BOOLOUT(longlong4, !)
+
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned char)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed char)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned short)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed short)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned int)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed int)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, float)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, double)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, unsigned long long)
+DECLOP_4VAR_SCALE_PRODUCT(longlong4, signed long long)
+
+
 #endif
 
+#endif
diff --git a/src/device_util.cpp b/src/device_util.cpp
index 5452e1c905..669fcb7570 100644
--- a/src/device_util.cpp
+++ b/src/device_util.cpp
@@ -14,7 +14,7 @@ all copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
@@ -129,34 +129,34 @@ __device__ int __hip_move_dpp(int src, int dpp_ctrl, int row_mask, int bank_mask
 
 __device__ char4 __hip_hc_add8pk(char4 in1, char4 in2) {
     char4 out;
-    unsigned one1 = in1.val & MASK1;
-    unsigned one2 = in2.val & MASK1;
-    out.val = (one1 + one2) & MASK1;
-    one1 = in1.val & MASK2;
-    one2 = in2.val & MASK2;
-    out.val = out.val | ((one1 + one2) & MASK2);
+    unsigned one1 = in1.a & MASK1;
+    unsigned one2 = in2.a & MASK1;
+    out.a = (one1 + one2) & MASK1;
+    one1 = in1.a & MASK2;
+    one2 = in2.a & MASK2;
+    out.a = out.a | ((one1 + one2) & MASK2);
     return out;
 }
 
 __device__ char4 __hip_hc_sub8pk(char4 in1, char4 in2) {
     char4 out;
-    unsigned one1 = in1.val & MASK1;
-    unsigned one2 = in2.val & MASK1;
-    out.val = (one1 - one2) & MASK1;
-    one1 = in1.val & MASK2;
-    one2 = in2.val & MASK2;
-    out.val = out.val | ((one1 - one2) & MASK2);
+    unsigned one1 = in1.a & MASK1;
+    unsigned one2 = in2.a & MASK1;
+    out.a = (one1 - one2) & MASK1;
+    one1 = in1.a & MASK2;
+    one2 = in2.a & MASK2;
+    out.a = out.a | ((one1 - one2) & MASK2);
     return out;
 }
 
 __device__ char4 __hip_hc_mul8pk(char4 in1, char4 in2) {
     char4 out;
-    unsigned one1 = in1.val & MASK1;
-    unsigned one2 = in2.val & MASK1;
-    out.val = (one1 * one2) & MASK1;
-    one1 = in1.val & MASK2;
-    one2 = in2.val & MASK2;
-    out.val = out.val | ((one1 * one2) & MASK2);
+    unsigned one1 = in1.a & MASK1;
+    unsigned one2 = in2.a & MASK1;
+    out.a = (one1 * one2) & MASK1;
+    one1 = in1.a & MASK2;
+    one2 = in2.a & MASK2;
+    out.a = out.a | ((one1 * one2) & MASK2);
     return out;
 }
 
@@ -2179,426 +2179,17 @@ __device__ double __hip_fast_dsqrt_rz(double x) {
   return hc::fast_math::sqrt(x);
 }
 
-__HIP_DEVICE__ char1 make_char1(signed char x)
-{
-    char1 c1;
-    c1.x = x;
-    return c1;
-}
-
-__HIP_DEVICE__ char2 make_char2(signed char x, signed char y)
-{
-    char2 c2;
-    c2.x = x;
-    c2.y = y;
-    return c2;
-}
-
-__HIP_DEVICE__ char3 make_char3(signed char x, signed char y, signed char z)
-{
-    char3 c3;
-    c3.x = x;
-    c3.y = y;
-    c3.z = z;
-    return c3;
-}
-
-__HIP_DEVICE__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
-{
-    char4 c4;
-    c4.x = x;
-    c4.y = y;
-    c4.z = z;
-    c4.w = w;
-    return c4;
-}
-
-__HIP_DEVICE__ short1 make_short1(short x)
-{
-    short1 s1;
-    s1.x = x;
-    return s1;
-}
-
-__HIP_DEVICE__ short2 make_short2(short x, short y)
-{
-    short2 s2;
-    s2.x = x;
-    s2.y = y;
-    return s2;
-}
-
-__HIP_DEVICE__ short3 make_short3(short x, short y, short z)
-{
-    short3 s3;
-    s3.x = x;
-    s3.y = y;
-    s3.z = z;
-    return s3;
-}
-
-__HIP_DEVICE__ short4 make_short4(short x, short y, short z, short w)
-{
-    short4 s4;
-    s4.x = x;
-    s4.y = y;
-    s4.z = z;
-    s4.w = w;
-    return s4;
-}
-
-__HIP_DEVICE__ int1 make_int1(int x)
-{
-    int1 i1;
-    i1.x = x;
-    return i1;
-}
-
-__HIP_DEVICE__ int2 make_int2(int x, int y)
-{
-    int2 i2;
-    i2.x = x;
-    i2.y = y;
-    return i2;
-}
-
-__HIP_DEVICE__ int3 make_int3(int x, int y, int z)
-{
-    int3 i3;
-    i3.x = x;
-    i3.y = y;
-    i3.z = z;
-    return i3;
-}
-
-__HIP_DEVICE__ int4 make_int4(int x, int y, int z, int w)
-{
-    int4 i4;
-    i4.x = x;
-    i4.y = y;
-    i4.z = z;
-    i4.w = w;
-    return i4;
-}
-
-__HIP_DEVICE__ long1 make_long1(long x)
-{
-    long1 l1;
-    l1.x = x;
-    return l1;
-}
-
-__HIP_DEVICE__ long2 make_long2(long x, long y)
-{
-    long2 l2;
-    l2.x = x;
-    l2.y = y;
-    return l2;
-}
-
-__HIP_DEVICE__ long3 make_long3(long x, long y, long z)
-{
-    long3 l3;
-    l3.x = x;
-    l3.y = y;
-    l3.z = z;
-    return l3;
-}
-
-__HIP_DEVICE__ long4 make_long4(long x, long y, long z, long w)
-{
-    long4 l4;
-    l4.x = x;
-    l4.y = y;
-    l4.z = z;
-    l4.w = w;
-    return l4;
-}
-
-__HIP_DEVICE__ longlong1 make_longlong1(long long x)
-{
-    longlong1 l1;
-    l1.x = x;
-    return l1;
-}
-
-__HIP_DEVICE__ longlong2 make_longlong2(long long x, long long y)
-{
-    longlong2 l2;
-    l2.x = x;
-    l2.y = y;
-    return l2;
-}
-
-__HIP_DEVICE__ longlong3 make_longlong3(long long x, long long y, long long z)
-{
-    longlong3 l3;
-    l3.x = x;
-    l3.y = y;
-    l3.z = z;
-    return l3;
-}
-
-__HIP_DEVICE__ longlong4 make_longlong4(long long x, long long y, long long z, long long w)
-{
-    longlong4 l4;
-    l4.x = x;
-    l4.y = y;
-    l4.z = z;
-    l4.w = w;
-    return l4;
-}
-
-__HIP_DEVICE__ uchar1 make_uchar1(unsigned char x)
-{
-    uchar1 c1;
-    c1.x = x;
-    return c1;
-}
-
-__HIP_DEVICE__ uchar2 make_uchar2(unsigned char x, unsigned char y)
-{
-    uchar2 c2;
-    c2.x = x;
-    c2.y = y;
-    return c2;
-}
-
-__HIP_DEVICE__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
-{
-    uchar3 c3;
-    c3.x = x;
-    c3.y = y;
-    c3.z = z;
-    return c3;
-}
-
-__HIP_DEVICE__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
-{
-    uchar4 c4;
-    c4.x = x;
-    c4.y = y;
-    c4.z = z;
-    c4.w = w;
-    return c4;
-}
-
-__HIP_DEVICE__ ushort1 make_ushort1(unsigned short x)
-{
-    ushort1 s1;
-    s1.x = x;
-    return s1;
-}
-
-__HIP_DEVICE__ ushort2 make_ushort2(unsigned short x, unsigned short y)
-{
-    ushort2 s2;
-    s2.x = x;
-    s2.y = y;
-    return s2;
-}
-
-__HIP_DEVICE__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
-{
-    ushort3 s3;
-    s3.x = x;
-    s3.y = y;
-    s3.z = z;
-    return s3;
-}
-
-__HIP_DEVICE__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
-{
-    ushort4 s4;
-    s4.x = x;
-    s4.y = y;
-    s4.z = z;
-    s4.w = w;
-    return s4;
-}
-
-__HIP_DEVICE__ uint1 make_uint1(unsigned int x)
-{
-    uint1 i1;
-    i1.x = x;
-    return i1;
-}
-
-__HIP_DEVICE__ uint2 make_uint2(unsigned int x, unsigned int y)
-{
-    uint2 i2;
-    i2.x = x;
-    i2.y = y;
-    return i2;
-}
-
-__HIP_DEVICE__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
-{
-    uint3 i3;
-    i3.x = x;
-    i3.y = y;
-    i3.z = z;
-    return i3;
-}
-
-__HIP_DEVICE__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
-{
-    uint4 i4;
-    i4.x = x;
-    i4.y = y;
-    i4.z = z;
-    i4.w = w;
-    return i4;
-}
-
-__HIP_DEVICE__ ulong1 make_ulong1(unsigned long x)
-{
-    ulong1 l1;
-    l1.x = x;
-    return l1;
-}
-
-__HIP_DEVICE__ ulong2 make_ulong2(unsigned long x, unsigned long y)
-{
-    ulong2 l2;
-    l2.x = x;
-    l2.y = y;
-    return l2;
-}
-
-__HIP_DEVICE__ ulong3 make_ulong3(unsigned long x, unsigned long y, unsigned long z)
-{
-    ulong3 l3;
-    l3.x = x;
-    l3.y = y;
-    l3.z = z;
-    return l3;
-}
-
-__HIP_DEVICE__ ulong4 make_ulong4(unsigned long x, unsigned long y, unsigned long z, unsigned long w)
-{
-    ulong4 l4;
-    l4.x = x;
-    l4.y = y;
-    l4.z = z;
-    l4.w = w;
-    return l4;
-}
-
-__HIP_DEVICE__ ulonglong1 make_ulonglong1(unsigned long long x)
-{
-    ulonglong1 l1;
-    l1.x = x;
-    return l1;
-}
-
-__HIP_DEVICE__ ulonglong2 make_ulonglong2(unsigned long long x, unsigned long long y)
-{
-    ulonglong2 l2;
-    l2.x = x;
-    l2.y = y;
-    return l2;
-}
-
-__HIP_DEVICE__ ulonglong3 make_ulonglong3(unsigned long long x, unsigned long long y, unsigned long long z)
-{
-    ulonglong3 l3;
-    l3.x = x;
-    l3.y = y;
-    l3.z = z;
-    return l3;
-}
-
-__HIP_DEVICE__ ulonglong4 make_ulonglong4(unsigned long long x, unsigned long long y, unsigned long long z, unsigned long long w)
-{
-    ulonglong4 l4;
-    l4.x = x;
-    l4.y = y;
-    l4.z = z;
-    l4.w = w;
-    return l4;
-}
-
-__HIP_DEVICE__ float1 make_float1(float x)
-{
-    float1 f1;
-    f1.x = x;
-    return f1;
-}
-
-__HIP_DEVICE__ float2 make_float2(float x, float y)
-{
-    float2 f2;
-    f2.x = x;
-    f2.y = y;
-    return f2;
-}
-
-__HIP_DEVICE__ float3 make_float3(float x, float y, float z)
-{
-    float3 f3;
-    f3.x = x;
-    f3.y = y;
-    f3.z = z;
-    return f3;
-}
-
-__HIP_DEVICE__ float4 make_float4(float x, float y, float z, float w)
-{
-    float4 f4;
-    f4.x = x;
-    f4.y = y;
-    f4.z = z;
-    f4.w = w;
-    return f4;
-}
-
-__HIP_DEVICE__ double1 make_double1(double x)
-{
-    double1 d1;
-    d1.x = x;
-    return d1;
-}
-
-__HIP_DEVICE__ double2 make_double2(double x, double y)
-{
-    double2 d2;
-    d2.x = x;
-    d2.y = y;
-    return d2;
-}
-
-__HIP_DEVICE__ double3 make_double3(double x, double y, double z)
-{
-    double3 d3;
-    d3.x = x;
-    d3.y = y;
-    d3.z = z;
-    return d3;
-}
-
-__HIP_DEVICE__ double4 make_double4(double x, double y, double z, double w)
-{
-    double4 d4;
-    d4.x = x;
-    d4.y = y;
-    d4.z = z;
-    d4.w = w;
-    return d4;
-}
-
-
-__HIP_DEVICE__ double  __longlong_as_double(long long int x)
+__device__ double  __longlong_as_double(long long int x)  // returns the double whose 64-bit pattern equals x
 {
-  return static_cast<double>(x);
+  double d; __builtin_memcpy(&d, &x, sizeof(d)); return d;  // bit copy: static_cast converted the numeric value instead
 }
 
-__HIP_DEVICE__ long long __double_as_longlong(double x)
+__device__ long long __double_as_longlong(double x)  // returns the 64-bit pattern of x as a signed integer
 {
-  return static_cast<long long>(x);
+  long long i; __builtin_memcpy(&i, &x, sizeof(i)); return i;  // bit copy: static_cast truncated the numeric value instead
 }
 
-__HIP_DEVICE__ void  __threadfence_system(void){
+__device__ void  __threadfence_system(void){
     // no-op
 }
 
@@ -3391,5 +2982,3 @@ __host__ double norm4d(double a, double b, double c, double d)
 {
     return std::sqrt(a*a + b*b + c*c + d*d);
 }
-
-
diff --git a/src/hip_ldg.cpp b/src/hip_ldg.cpp
index f3e593355a..075e1926f1 100644
--- a/src/hip_ldg.cpp
+++ b/src/hip_ldg.cpp
@@ -20,23 +20,22 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */
 
-#include <hc.hpp>
-
 #include "hip/hcc_detail/hip_ldg.h"
+#include "hip/hcc_detail/hip_vector_types.h"
 
 __device__ char                 __ldg(const char* ptr)
 {
-    return ptr[0];
+    return *ptr;
 }
 
 __device__ char2                __ldg(const char2* ptr)
 {
-    return ptr[0];
+    return *ptr;
 }
 
 __device__ char4                __ldg(const char4* ptr)
 {
-    return ptr[0];
+    return *ptr;
 }
 
 __device__ signed char          __ldg(const signed char* ptr)
@@ -169,6 +168,3 @@ __device__ double2              __ldg(const double2* ptr)
 {
     return ptr[0];
 }
-
-
-
diff --git a/tests/src/deviceLib/hip_test_ldg.cpp b/tests/src/deviceLib/hip_test_ldg.cpp
index 0737533175..171ff1afd0 100644
--- a/tests/src/deviceLib/hip_test_ldg.cpp
+++ b/tests/src/deviceLib/hip_test_ldg.cpp
@@ -32,6 +32,7 @@ THE SOFTWARE.
 #include <stdlib.h>
 #include<iostream>
 #include "hip/hip_runtime.h"
+#include "hip/hip_vector_types.h"
 #include "test_common.h"
 
 #if (__hcc_workweek__ >= 16164) || defined (__HIP_PLATFORM_NVCC__)
@@ -389,4 +390,3 @@ int main() {
 }
 
 #endif
-

From e30887dc694d130e434c8800daad8f6216def8ab Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 17:53:32 -0600
Subject: [PATCH 11/18] fixed compilation issues with operator overloading
 device data types

Change-Id: I6a60282f0c04a3c0d382cdf2d67ad8d9156880ad
---
 include/hip/hcc_detail/hip_vector_types.h | 80 +++++++++++------------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h
index 812bd272d0..e15da69b3f 100644
--- a/include/hip/hcc_detail/hip_vector_types.h
+++ b/include/hip/hcc_detail/hip_vector_types.h
@@ -1131,14 +1131,14 @@ struct longlong4 {
 } __attribute__((aligned(32)));
 
 #define DECLOP_MAKE_ONE_COMPONENT(comp, type) \
-__device__ __host__ inline type make_##type(comp x) { \
+__device__ __host__ static inline type make_##type(comp x) { \
   type ret; \
   ret.x = x; \
   return ret; \
 }
 
 #define DECLOP_MAKE_TWO_COMPONENT(comp, type) \
-__device__ __host__ inline type make_##type(comp x, comp y) { \
+__device__ __host__ static inline type make_##type(comp x, comp y) { \
   type ret; \
   ret.x = x; \
   ret.y = y; \
@@ -1146,7 +1146,7 @@ __device__ __host__ inline type make_##type(comp x, comp y) { \
 }
 
 #define DECLOP_MAKE_THREE_COMPONENT(comp, type) \
-__device__ __host__ inline type make_##type(comp x, comp y, comp z) { \
+__device__ __host__ static inline type make_##type(comp x, comp y, comp z) { \
   type ret; \
   ret.x = x; \
   ret.y = y; \
@@ -1155,7 +1155,7 @@ __device__ __host__ inline type make_##type(comp x, comp y, comp z) { \
 }
 
 #define DECLOP_MAKE_FOUR_COMPONENT(comp, type) \
-__device__ __host__ inline type make_##type(comp x, comp y, comp z, comp w) { \
+__device__ __host__ static inline type make_##type(comp x, comp y, comp z, comp w) { \
   type ret; \
   ret.x = x; \
   ret.y = y; \
@@ -1228,39 +1228,39 @@ DECLOP_MAKE_FOUR_COMPONENT(signed long, longlong4);
 #if __cplusplus
 
 #define DECLOP_1VAR_2IN_1OUT(type, op) \
-__device__ __host__ type operator op (const type& lhs, const type& rhs) { \
+__device__ __host__ static type operator op (const type& lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs.x op rhs.x; \
   return ret; \
 }
 
 #define DECLOP_1VAR_SCALE_PRODUCT(type, type1) \
-__device__ __host__ type operator * (const type& lhs, type1 rhs) { \
+__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
   type ret; \
   ret.x = lhs.x * rhs; \
   return ret; \
 } \
 \
-__device__ __host__ type operator * (type1 lhs, const type& rhs) { \
+__device__ __host__ static type operator * (type1 lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs * rhs.x; \
   return ret; \
 }
 
 #define DECLOP_1VAR_ASSIGN(type, op) \
-__device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
   lhs.x op rhs.x; \
   return lhs; \
 }
 
 #define DECLOP_1VAR_PREOP(type, op) \
-__device__ __host__ inline type& operator op (type& val) { \
+__device__ __host__ static inline type& operator op (type& val) { \
   op val.x; \
   return val; \
 }
 
 #define DECLOP_1VAR_POSTOP(type, op) \
-__device__ __host__ type operator op (type& val, int i) { \
+__device__ __host__ static type operator op (type& val, int i) { \
   type ret; \
   ret.x = val.x; \
   val.x op; \
@@ -1268,19 +1268,19 @@ __device__ __host__ type operator op (type& val, int i) { \
 }
 
 #define DECLOP_1VAR_COMP(type, op) \
-__device__ __host__ inline bool operator op (type& lhs, type& rhs) { \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
   return lhs.x op rhs.x; \
 }
 
 #define DECLOP_1VAR_1IN_1OUT(type, op) \
-__device__ __host__ type operator op(type& rhs) { \
+__device__ __host__ static type operator op(type& rhs) { \
   type ret; \
   ret.x = op rhs.x; \
   return ret; \
 }
 
 #define DECLOP_1VAR_1IN_BOOLOUT(type, op) \
-__device__ __host__ inline bool operator op (type& rhs) { \
+__device__ __host__ static inline bool operator op (type& rhs) { \
   return op rhs.x; \
 }
 
@@ -1289,7 +1289,7 @@ __device__ __host__ inline bool operator op (type& rhs) { \
 */
 
 #define DECLOP_2VAR_2IN_1OUT(type, op) \
-__device__ __host__ type operator op (const type& lhs, const type& rhs) { \
+__device__ __host__ static type operator op (const type& lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs.x op rhs.x; \
   ret.y = lhs.y op rhs.y; \
@@ -1297,14 +1297,14 @@ __device__ __host__ type operator op (const type& lhs, const type& rhs) { \
 }
 
 #define DECLOP_2VAR_SCALE_PRODUCT(type, type1) \
-__device__ __host__ type operator * (const type& lhs, type1 rhs) { \
+__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
   type ret; \
   ret.x = lhs.x * rhs; \
   ret.y = lhs.y * rhs; \
   return ret; \
 } \
 \
-__device__ __host__ type operator * (type1 lhs, const type& rhs) { \
+__device__ __host__ static type operator * (type1 lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs * rhs.x; \
   ret.y = lhs * rhs.y; \
@@ -1312,21 +1312,21 @@ __device__ __host__ type operator * (type1 lhs, const type& rhs) { \
 }
 
 #define DECLOP_2VAR_ASSIGN(type, op) \
-__device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
   lhs.x op rhs.x; \
   lhs.y op rhs.y; \
   return lhs; \
 }
 
 #define DECLOP_2VAR_PREOP(type, op) \
-__device__ __host__ inline type& operator op (type& val) { \
+__device__ __host__ static inline type& operator op (type& val) { \
   op val.x; \
   op val.y; \
   return val; \
 }
 
 #define DECLOP_2VAR_POSTOP(type, op) \
-__device__ __host__ type operator op (type& val, int i) { \
+__device__ __host__ static type operator op (type& val, int i) { \
   type ret; \
   ret.x = val.x; \
   ret.y = val.y; \
@@ -1336,12 +1336,12 @@ __device__ __host__ type operator op (type& val, int i) { \
 }
 
 #define DECLOP_2VAR_COMP(type, op) \
-__device__ __host__ inline bool operator op (type& lhs, type& rhs) { \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
   return lhs.x op rhs.x && lhs.y op rhs.y; \
 }
 
 #define DECLOP_2VAR_1IN_1OUT(type, op) \
-__device__ __host__ type operator op(type &rhs) { \
+__device__ __host__ static type operator op(type &rhs) { \
   type ret; \
   ret.x = op rhs.x; \
   ret.y = op rhs.y; \
@@ -1349,7 +1349,7 @@ __device__ __host__ type operator op(type &rhs) { \
 }
 
 #define DECLOP_2VAR_1IN_BOOLOUT(type, op) \
-__device__ __host__ inline bool operator op (type &rhs) { \
+__device__ __host__ static inline bool operator op (type &rhs) { \
   return op rhs.x && op rhs.y; \
 }
 
@@ -1359,7 +1359,7 @@ __device__ __host__ inline bool operator op (type &rhs) { \
 */
 
 #define DECLOP_3VAR_2IN_1OUT(type, op) \
-__device__ __host__ type operator op (const type& lhs, const type& rhs) { \
+__device__ __host__ static type operator op (const type& lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs.x op rhs.x; \
   ret.y = lhs.y op rhs.y; \
@@ -1368,7 +1368,7 @@ __device__ __host__ type operator op (const type& lhs, const type& rhs) { \
 }
 
 #define DECLOP_3VAR_SCALE_PRODUCT(type, type1) \
-__device__ __host__ type operator * (const type& lhs, type1 rhs) { \
+__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
   type ret; \
   ret.x = lhs.x * rhs; \
   ret.y = lhs.y * rhs; \
@@ -1376,7 +1376,7 @@ __device__ __host__ type operator * (const type& lhs, type1 rhs) { \
   return ret; \
 } \
 \
-__device__ __host__ type operator * (type1 lhs, const type& rhs) { \
+__device__ __host__ static type operator * (type1 lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs * rhs.x; \
   ret.y = lhs * rhs.y; \
@@ -1385,7 +1385,7 @@ __device__ __host__ type operator * (type1 lhs, const type& rhs) { \
 }
 
 #define DECLOP_3VAR_ASSIGN(type, op) \
-__device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
   lhs.x op rhs.x; \
   lhs.y op rhs.y; \
   lhs.z op rhs.z; \
@@ -1393,7 +1393,7 @@ __device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
 }
 
 #define DECLOP_3VAR_PREOP(type, op) \
-__device__ __host__ inline type& operator op (type& val) { \
+__device__ __host__ static inline type& operator op (type& val) { \
   op val.x; \
   op val.y; \
   op val.z; \
@@ -1401,7 +1401,7 @@ __device__ __host__ inline type& operator op (type& val) { \
 }
 
 #define DECLOP_3VAR_POSTOP(type, op) \
-__device__ __host__ type operator op (type& val, int i) { \
+__device__ __host__ static type operator op (type& val, int i) { \
   type ret; \
   ret.x = val.x; \
   ret.y = val.y; \
@@ -1413,12 +1413,12 @@ __device__ __host__ type operator op (type& val, int i) { \
 }
 
 #define DECLOP_3VAR_COMP(type, op) \
-__device__ __host__ inline bool operator op (type& lhs, type& rhs) { \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
   return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z; \
 }
 
 #define DECLOP_3VAR_1IN_1OUT(type, op) \
-__device__ __host__ type operator op(type &rhs) { \
+__device__ __host__ static type operator op(type &rhs) { \
   type ret; \
   ret.x = op rhs.x; \
   ret.y = op rhs.y; \
@@ -1427,7 +1427,7 @@ __device__ __host__ type operator op(type &rhs) { \
 }
 
 #define DECLOP_3VAR_1IN_BOOLOUT(type, op) \
-__device__ __host__ inline bool operator op (type &rhs) { \
+__device__ __host__ static inline bool operator op (type &rhs) { \
   return op rhs.x && op rhs.y && op rhs.z; \
 }
 
@@ -1437,7 +1437,7 @@ __device__ __host__ inline bool operator op (type &rhs) { \
 */
 
 #define DECLOP_4VAR_2IN_1OUT(type, op) \
-__device__ __host__ type operator op ( const type& lhs, const type& rhs) { \
+__device__ __host__ static type operator op ( const type& lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs.x op rhs.x; \
   ret.y = lhs.y op rhs.y; \
@@ -1447,7 +1447,7 @@ __device__ __host__ type operator op ( const type& lhs, const type& rhs) { \
 }
 
 #define DECLOP_4VAR_SCALE_PRODUCT(type, type1) \
-__device__ __host__ type operator * (const type& lhs, type1 rhs) { \
+__device__ __host__ static type operator * (const type& lhs, type1 rhs) { \
   type ret; \
   ret.x = lhs.x * rhs; \
   ret.y = lhs.y * rhs; \
@@ -1456,7 +1456,7 @@ __device__ __host__ type operator * (const type& lhs, type1 rhs) { \
   return ret; \
 } \
 \
-__device__ __host__ type operator * (type1 lhs, const type& rhs) { \
+__device__ __host__ static type operator * (type1 lhs, const type& rhs) { \
   type ret; \
   ret.x = lhs * rhs.x; \
   ret.y = lhs * rhs.y; \
@@ -1466,7 +1466,7 @@ __device__ __host__ type operator * (type1 lhs, const type& rhs) { \
 }
 
 #define DECLOP_4VAR_ASSIGN(type, op) \
-__device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
+__device__ __host__ static inline type& operator op ( type& lhs, const type& rhs) { \
   lhs.x op rhs.x; \
   lhs.y op rhs.y; \
   lhs.z op rhs.z; \
@@ -1475,7 +1475,7 @@ __device__ __host__ inline type& operator op ( type& lhs, const type& rhs) { \
 }
 
 #define DECLOP_4VAR_PREOP(type, op) \
-__device__ __host__ inline type& operator op (type& val) { \
+__device__ __host__ static inline type& operator op (type& val) { \
   op val.x; \
   op val.y; \
   op val.z; \
@@ -1484,7 +1484,7 @@ __device__ __host__ inline type& operator op (type& val) { \
 }
 
 #define DECLOP_4VAR_POSTOP(type, op) \
-__device__ __host__ type operator op (type& val, int i) { \
+__device__ __host__ static type operator op (type& val, int i) { \
   type ret; \
   ret.x = val.x; \
   ret.y = val.y; \
@@ -1498,12 +1498,12 @@ __device__ __host__ type operator op (type& val, int i) { \
 }
 
 #define DECLOP_4VAR_COMP(type, op) \
-__device__ __host__ inline bool operator op (type& lhs, type& rhs) { \
+__device__ __host__ static inline bool operator op (type& lhs, type& rhs) { \
   return lhs.x op rhs.x && lhs.y op rhs.y && lhs.z op rhs.z && lhs.w op rhs.w; \
 }
 
 #define DECLOP_4VAR_1IN_1OUT(type, op) \
-__device__ __host__ type operator op(type &rhs) { \
+__device__ __host__ static type operator op(type &rhs) { \
   type ret; \
   ret.x = op rhs.x; \
   ret.y = op rhs.y; \
@@ -1513,7 +1513,7 @@ __device__ __host__ type operator op(type &rhs) { \
 }
 
 #define DECLOP_4VAR_1IN_BOOLOUT(type, op) \
-__device__ __host__ inline bool operator op (type &rhs) { \
+__device__ __host__ static inline bool operator op (type &rhs) { \
   return op rhs.x && op rhs.y && op rhs.z && op rhs.w; \
 }
 

From 57294ce46149f831ad97f5c7ccf36b5aba7bb73c Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 18:02:30 -0600
Subject: [PATCH 12/18] added test for vector data types

Change-Id: I0b6624886e474601cb1ef003c5f10adf399a21c9
---
 tests/src/deviceLib/hipVectorTypes.cpp | 3443 ++++++++++++++++++++++++
 1 file changed, 3443 insertions(+)
 create mode 100644 tests/src/deviceLib/hipVectorTypes.cpp

diff --git a/tests/src/deviceLib/hipVectorTypes.cpp b/tests/src/deviceLib/hipVectorTypes.cpp
new file mode 100644
index 0000000000..0e5757b30d
--- /dev/null
+++ b/tests/src/deviceLib/hipVectorTypes.cpp
@@ -0,0 +1,3443 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+#include<assert.h>
+#include <hip/hip_vector_types.h>
+
+#define cmpFloat1(in, exp) /* Print a diagnostic and fail an assertion when in.x differs from exp. */ \
+  if((in).x != (exp)) { \
+    std::cout<<"Failed at: "<<__LINE__<<" in func: "<<__func__<<" expected output: "<<+(exp)<<" but got: "<<+(in).x<<std::endl; /* unary + prints char-sized values as numbers */ \
+    assert(false); /* must be a zero value: a non-zero argument such as -1 never fires */ \
+  } \
+
+#define cmpFloat2(in, exp) /* Print a diagnostic and fail an assertion when in.x or in.y differs from exp. */ \
+  if((in).x != (exp) || (in).y != (exp)) { \
+    std::cout<<"Failed at: "<<__LINE__<<" in func: "<<__func__<<" expected output: "<<+(exp)<<" but got: "<<+(in).x<<","<<+(in).y<<std::endl; /* unary + prints char-sized values as numbers */ \
+    assert(false); /* must be a zero value: a non-zero argument such as -1 never fires */ \
+  } \
+
+#define cmpFloat3(in, exp) /* Print a diagnostic and fail an assertion when in.x, in.y or in.z differs from exp. */ \
+  if((in).x != (exp) || (in).y != (exp) || (in).z != (exp)) { \
+    std::cout<<"Failed at: "<<__LINE__<<" in func: "<<__func__<<" expected output: "<<+(exp)<<" but got: "<<+(in).x<<","<<+(in).y<<","<<+(in).z<<std::endl; /* unary + prints char-sized values as numbers */ \
+    assert(false); /* must be a zero value: a non-zero argument such as -1 never fires */ \
+  } \
+
+#define cmpFloat4(in, exp) /* Print a diagnostic and fail an assertion when in.x, in.y, in.z or in.w differs from exp. */ \
+  if((in).x != (exp) || (in).y != (exp) || (in).z != (exp) || (in).w != (exp) ) { \
+    std::cout<<"Failed at: "<<__LINE__<<" in func: "<<__func__<<" expected output: "<<+(exp)<<" but got: "<<+(in).x<<","<<+(in).y<<","<<+(in).z<<","<<+(in).w<<std::endl; /* unary + prints char-sized values as numbers */ \
+    assert(false); /* must be a zero value: a non-zero argument such as -1 never fires */ \
+  } \
+
+bool TestUChar1() {  // Exercises the uchar1 operators; mismatches are reported through cmpFloat1/assert, and the function always returns true.
+  uchar1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat1(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f2.x = 2;
+  f3 = f1 << f2;
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f2.x = 1;
+  f1 += f2;
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f2 = f1++;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat1(f2, 253);  // 253 is 0xFD, the low 8 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUChar2() {  // Exercises the uchar2 operators; mismatches are reported through cmpFloat2/assert, and the function always returns true.
+  uchar2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat2(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f2 = f1++;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat2(f2, 253);  // 253 is 0xFD, the low 8 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUChar3() {  // Exercises the uchar3 operators; mismatches are reported through cmpFloat3/assert, and the function always returns true.
+  uchar3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat3(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat3(f2, 253);  // 253 is 0xFD, the low 8 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUChar4() {  // Exercises the uchar4 operators; mismatches are reported through cmpFloat4/assert, and the function always returns true.
+  uchar4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat4(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat4(f2, 253);  // 253 is 0xFD, the low 8 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestChar1() {  // Exercises the char1 operators; mismatches are reported through cmpFloat1/assert, and the function always returns true.
+  char1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat1(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f2.x = 2;
+  f3 = f1 << f2;
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f2.x = 1;
+  f1 += f2;
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f2 = f1++;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat1(f2, (char)253);  // 253 (0xFD, the low 8 bits of ~2) cast to char for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestChar2() {  // Exercises the char2 operators; mismatches are reported through cmpFloat2/assert, and the function always returns true.
+  char2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat2(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f2 = f1++;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat2(f2, (char)253);  // 253 (0xFD, the low 8 bits of ~2) cast to char for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestChar3() {  // Exercises the char3 operators; mismatches are reported through cmpFloat3/assert, and the function always returns true.
+  char3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat3(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat3(f2, (char)253);  // 253 (0xFD, the low 8 bits of ~2) cast to char for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestChar4() {  // Exercises the char4 operators; mismatches are reported through cmpFloat4/assert, and the function always returns true.
+  char4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat4(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat4(f2, (char)253);  // 253 (0xFD, the low 8 bits of ~2) cast to char for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUShort1() {  // Exercises the ushort1 operators; mismatches are reported through cmpFloat1/assert, and the function always returns true.
+  ushort1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat1(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f2.x = 2;
+  f3 = f1 << f2;
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f2.x = 1;
+  f1 += f2;
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f2 = f1++;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat1(f2, (unsigned short)65533);  // 65533 is 0xFFFD, the low 16 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUShort2() {  // Exercises the ushort2 operators; mismatches are reported through cmpFloat2/assert, and the function always returns true.
+  ushort2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat2(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f2 = f1++;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat2(f2, (unsigned short)65533);  // 65533 is 0xFFFD, the low 16 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUShort3() {  // Exercises the ushort3 operators; mismatches are reported through cmpFloat3/assert, and the function always returns true.
+  ushort3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat3(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat3(f2, (unsigned short)65533);  // 65533 is 0xFFFD, the low 16 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUShort4() {  // Exercises the ushort4 operators; mismatches are reported through cmpFloat4/assert, and the function always returns true.
+  ushort4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat4(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat4(f2, (unsigned short)65533);  // 65533 is 0xFFFD, the low 16 bits of ~2
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestShort1() {  // Exercises the short1 operators; mismatches are reported through cmpFloat1/assert, and the function always returns true.
+  short1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat1(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f2.x = 2;
+  f3 = f1 << f2;
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f2.x = 1;
+  f1 += f2;
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f2 = f1++;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat1(f2, (signed short)65533);  // 65533 (0xFFFD, the low 16 bits of ~2) cast to signed short for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestShort2() {  // Exercises the short2 operators; mismatches are reported through cmpFloat2/assert, and the function always returns true.
+  short2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;  // binary operators, starting with every component of f1 and f2 equal to 1
+  cmpFloat2(f3, 2);
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;  // shift checks: f1 holds 1 and f2 holds 2, so f1 << f2 gives 4 and f3 >> f1 gives 2
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;  // compound assignments: f1 starts at 2 while f2 stays at 1 throughout
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;  // back to 1, since the ^= above left f1 at 0
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;  // ++/--: the postfix forms yield the old value, the prefix forms the updated one
+  f1.y = 2;
+  f2 = f1++;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // f1 holds 2 at this point
+  cmpFloat2(f2, (signed short)65533);  // 65533 (0xFFFD, the low 16 bits of ~2) cast to signed short for the comparison
+  assert(!f1 == false);  // f1 is non-zero
+
+  f1.x = 3;  // comparison checks: f1 and f3 hold 3, f2 holds 4
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // logical operators with both operands non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestShort3() {  // Exercises short3 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  short3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2);  // cmpFloat3 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat3(f2, (signed short)65533);  // 65533 is 0xFFFD, the 16-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestShort4() {  // Exercises short4 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  short4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2);  // cmpFloat4 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat4(f2, (signed short)65533);  // 65533 is 0xFFFD, the 16-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+
+bool TestUInt1() {  // Exercises uint1 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  uint1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;
+  cmpFloat1(f3, 2);  // cmpFloat1 is defined outside this block; presumably it checks f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;
+  f2.x = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;
+  f2.x = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat1(f2, (unsigned int)4294967293);  // 4294967293 is 0xFFFFFFFD, i.e. ~2 in 32 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUInt2() {  // Exercises uint2 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  uint2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;
+  cmpFloat2(f3, 2);  // cmpFloat2 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat2(f2, (unsigned int)4294967293);  // 4294967293 is 0xFFFFFFFD, i.e. ~2 in 32 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUInt3() {  // Exercises uint3 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  uint3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2);  // cmpFloat3 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat3(f2, (unsigned int)4294967293);  // 4294967293 is 0xFFFFFFFD, i.e. ~2 in 32 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestUInt4() {  // Exercises uint4 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  uint4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2);  // cmpFloat4 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat4(f2, (unsigned int)4294967293);  // 4294967293 is 0xFFFFFFFD, i.e. ~2 in 32 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestInt1() {  // Exercises int1 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  int1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;
+  cmpFloat1(f3, 2);  // cmpFloat1 is defined outside this block; presumably it checks f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;
+  f2.x = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;
+  f2.x = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat1(f2, (signed int)4294967293);  // 4294967293 is 0xFFFFFFFD, the 32-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestInt2() {  // Exercises int2 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  int2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;
+  cmpFloat2(f3, 2);  // cmpFloat2 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat2(f2, (signed int)4294967293);  // 4294967293 is 0xFFFFFFFD, the 32-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestInt3() {  // Exercises int3 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  int3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2);  // cmpFloat3 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat3(f2, (signed int)4294967293);  // 4294967293 is 0xFFFFFFFD, the 32-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestInt4() {  // Exercises int4 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  int4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2);  // cmpFloat4 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat4(f2, (signed int)4294967293);  // 4294967293 is 0xFFFFFFFD, the 32-bit pattern of ~2
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestULong1() {  // Exercises ulong1 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  ulong1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;
+  cmpFloat1(f3, 2);  // cmpFloat1 is defined outside this block; presumably it checks f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;
+  f2.x = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;
+  f2.x = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat1(f2, 18446744073709551613UL);  // 18446744073709551613 is 2^64 - 3, i.e. ~2 in 64 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestULong2() {  // Exercises ulong2 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  ulong2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;
+  cmpFloat2(f3, 2);  // cmpFloat2 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat2(f2, 18446744073709551613UL);  // 18446744073709551613 is 2^64 - 3, i.e. ~2 in 64 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestULong3() {  // Exercises ulong3 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  ulong3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2);  // cmpFloat3 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat3(f2, 18446744073709551613UL);  // 18446744073709551613 is 2^64 - 3, i.e. ~2 in 64 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestULong4() {  // Exercises ulong4 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  ulong4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2);  // cmpFloat4 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat4(f2, 18446744073709551613UL);  // 18446744073709551613 is 2^64 - 3, i.e. ~2 in 64 bits
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestLong1() {  // Exercises long1 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  long1 f1, f2, f3;
+  f1.x = 1;
+  f2.x = 1;
+  f3 = f1 + f2;
+  cmpFloat1(f3, 2);  // cmpFloat1 is defined outside this block; presumably it checks f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat1(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat1(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat1(f2, 0);
+  f1.x = 1;
+  f2.x = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat1(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat1(f2, 2);
+
+  f1.x = 2;
+  f2.x = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat1(f1, 3);
+  f1 -= f2;
+  cmpFloat1(f1, 2);
+  f1 *= f2;
+  cmpFloat1(f1, 2);
+  f1 /= f2;
+  cmpFloat1(f1, 2);
+  f1 %= f2;
+  cmpFloat1(f1, 0);
+  f1 &= f2;
+  cmpFloat1(f1, 0);
+  f1 |= f2;
+  cmpFloat1(f1, 1);
+  f1 ^= f2;
+  cmpFloat1(f1, 0);
+  f1.x = 1;
+  f1 <<= f2;
+  cmpFloat1(f1, 2);
+  f1 >>= f2;
+  cmpFloat1(f1, 1);
+
+  f1.x = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat1(f2, 3);
+  cmpFloat1(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat1(f1, 3);
+  cmpFloat1(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat1(f1, 2);
+  cmpFloat1(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat1(f2, -3);  // ~2 == -3 in two's complement
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f2.x = 4;
+  f3.x = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestLong2() {  // Exercises long2 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  long2 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f3 = f1 + f2;
+  cmpFloat2(f3, 2);  // cmpFloat2 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat2(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat2(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat2(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat2(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat2(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat2(f1, 3);
+  f1 -= f2;
+  cmpFloat2(f1, 2);
+  f1 *= f2;
+  cmpFloat2(f1, 2);
+  f1 /= f2;
+  cmpFloat2(f1, 2);
+  f1 %= f2;
+  cmpFloat2(f1, 0);
+  f1 &= f2;
+  cmpFloat2(f1, 0);
+  f1 |= f2;
+  cmpFloat2(f1, 1);
+  f1 ^= f2;
+  cmpFloat2(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1 <<= f2;
+  cmpFloat2(f1, 2);
+  f1 >>= f2;
+  cmpFloat2(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat2(f2, 3);
+  cmpFloat2(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat2(f1, 3);
+  cmpFloat2(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat2(f1, 2);
+  cmpFloat2(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat2(f2, -3);  // ~2 == -3 in two's complement
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f3.x = 3;
+  f3.y = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestLong3() {  // Exercises long3 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  long3 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2);  // cmpFloat3 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat3(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat3(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat3(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat3(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat3(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat3(f1, 3);
+  f1 -= f2;
+  cmpFloat3(f1, 2);
+  f1 *= f2;
+  cmpFloat3(f1, 2);
+  f1 /= f2;
+  cmpFloat3(f1, 2);
+  f1 %= f2;
+  cmpFloat3(f1, 0);
+  f1 &= f2;
+  cmpFloat3(f1, 0);
+  f1 |= f2;
+  cmpFloat3(f1, 1);
+  f1 ^= f2;
+  cmpFloat3(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1 <<= f2;
+  cmpFloat3(f1, 2);
+  f1 >>= f2;
+  cmpFloat3(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat3(f2, 3);
+  cmpFloat3(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat3(f1, 3);
+  cmpFloat3(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat3(f1, 2);
+  cmpFloat3(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat3(f2, -3);  // ~2 == -3 in two's complement
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+bool TestLong4() {  // Exercises long4 operators (arithmetic, bitwise, shift, compound assignment, ++/--, ~, !, comparison, logical); returns true if it runs to completion.
+  long4 f1, f2, f3;
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2);  // cmpFloat4 is defined outside this block; presumably it checks every component of f3 against 2 -- confirm
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2/2);
+  f3 = f1 % f2;
+  cmpFloat4(f3, 0);
+  f1 = f3 & f2;
+  cmpFloat4(f1, 0);
+  f2 = f1 ^ f3;
+  cmpFloat4(f2, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f2.x = 2;
+  f2.y = 2;
+  f2.z = 2;
+  f2.w = 2;
+  f3 = f1 << f2;  // 1 << 2
+  cmpFloat4(f3, 4);
+  f2 = f3 >> f1;  // 4 >> 1
+  cmpFloat4(f2, 2);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2.x = 1;
+  f2.y = 1;
+  f2.z = 1;
+  f2.w = 1;
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1
+  cmpFloat4(f1, 3);
+  f1 -= f2;
+  cmpFloat4(f1, 2);
+  f1 *= f2;
+  cmpFloat4(f1, 2);
+  f1 /= f2;
+  cmpFloat4(f1, 2);
+  f1 %= f2;
+  cmpFloat4(f1, 0);
+  f1 &= f2;
+  cmpFloat4(f1, 0);
+  f1 |= f2;
+  cmpFloat4(f1, 1);
+  f1 ^= f2;
+  cmpFloat4(f1, 0);
+  f1.x = 1;
+  f1.y = 1;
+  f1.z = 1;
+  f1.w = 1;
+  f1 <<= f2;
+  cmpFloat4(f1, 2);
+  f1 >>= f2;
+  cmpFloat4(f1, 1);
+
+  f1.x = 2;
+  f1.y = 2;
+  f1.z = 2;
+  f1.w = 2;
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 2);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat4(f2, 3);
+  cmpFloat4(f1, 2);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat4(f1, 3);
+  cmpFloat4(f2, 3);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat4(f1, 2);
+  cmpFloat4(f2, 2);
+
+  f2 = ~f1;  // bitwise NOT of 2
+  cmpFloat4(f2, -3);  // ~2 == -3 in two's complement
+  assert(!f1 == false);  // f1 holds 2 at this point
+
+  f1.x = 3;
+  f1.y = 3;
+  f1.z = 3;
+  f1.w = 3;
+  f2.x = 4;
+  f2.y = 4;
+  f2.z = 4;
+  f2.w = 4;
+  f3.x = 3;
+  f3.y = 3;
+  f3.z = 3;
+  f3.w = 3;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3, f2 holds 4
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  assert((f1 && f2) == true);  // both operands are non-zero
+  assert((f1 || f2) == true);
+  return true;
+}
+
+
+bool TestFloat1() {  // Exercises float1 operators (arithmetic, compound assignment, ++/--, comparison); returns true if it runs to completion.
+  float1 f1, f2, f3;
+//  float1 f4(1);
+//  cmpFloat1(f4, 1.0f);
+//  float1 f5(2.0f);
+//  cmpFloat1(f5, 2.0f);
+  f1.x = 1.0f;
+  f2.x = 1.0f;
+  f3 = f1 + f2;
+  cmpFloat1(f3, 2.0f);  // cmpFloat1 is defined outside this block; presumably it checks f3 against 2.0f -- confirm
+  f2 = f3 - f1;
+  cmpFloat1(f2, 1.0f);
+  f1 = f2 * f3;
+  cmpFloat1(f1, 2.0f);
+  f2 = f1 / f3;
+  cmpFloat1(f2, 2.0f/2.0f);
+  f1 += f2;  // compound assignments: each one operates on the previous result, with f2 holding 1.0f
+  cmpFloat1(f1, 3.0f);
+  f1 -= f2;
+  cmpFloat1(f1, 2.0f);
+  f1 *= f2;
+  cmpFloat1(f1, 2.0f);
+  f1 /= f2;
+  cmpFloat1(f1, 2.0f);
+  f2 = f1++;  // post-increment: f2 is expected to hold the old value
+  cmpFloat1(f1, 3.0f);
+  cmpFloat1(f2, 2.0f);
+  f2 = f1--;  // post-decrement: f2 is expected to hold the old value
+  cmpFloat1(f2, 3.0f);
+  cmpFloat1(f1, 2.0f);
+  f2 = ++f1;  // pre-increment: f2 is expected to hold the new value
+  cmpFloat1(f1, 3.0f);
+  cmpFloat1(f2, 3.0f);
+  f2 = --f1;  // pre-decrement: f2 is expected to hold the new value
+  cmpFloat1(f1, 2.0f);
+  cmpFloat1(f2, 2.0f);  // check the value returned by --f1, not only the operand
+
+  f1.x = 3.0f;
+  f2.x = 4.0f;
+  f3.x = 3.0f;
+  assert((f1 == f2) == false);  // comparisons: f1 and f3 hold 3.0f, f2 holds 4.0f
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  return true;
+}
+
+bool TestFloat2() {  // float2: checks arithmetic, increment/decrement and comparison operators; returns true once all checks have run
+  float2 f1, f2, f3;
+  f1.x = 1.0f;
+  f1.y = 1.0f;
+  f2.x = 1.0f;
+  f2.y = 1.0f;
+  f3 = f1 + f2;
+  cmpFloat2(f3, 2.0f);
+  f2 = f3 - f1;
+  cmpFloat2(f2, 1.0f);
+  f1 = f2 * f3;
+  cmpFloat2(f1, 2.0f);
+  f2 = f1 / f3;
+  cmpFloat2(f2, 2.0f/2.0f);
+  f1 += f2;
+  cmpFloat2(f1, 3.0f);
+  f1 -= f2;
+  cmpFloat2(f1, 2.0f);
+  f1 *= f2;
+  cmpFloat2(f1, 2.0f);
+  f1 /= f2;
+  cmpFloat2(f1, 2.0f);
+  // Increment/decrement: the postfix forms hand back the old value, the prefix forms the new one.
+  f2 = f1++;
+  cmpFloat2(f1, 3.0f);
+  cmpFloat2(f2, 2.0f);
+  f2 = f1--;
+  cmpFloat2(f2, 3.0f);
+  cmpFloat2(f1, 2.0f);
+  f2 = ++f1;
+  cmpFloat2(f1, 3.0f);
+  cmpFloat2(f2, 3.0f);
+  f2 = --f1;
+  cmpFloat2(f1, 2.0f);
+  cmpFloat2(f2, 2.0f);  // check the value returned by --f1, not f1 a second time
+  // Comparison operators, with f1 == f3 and f1 < f2 in every component.
+  f1.x = 3.0f;
+  f1.y = 3.0f;
+  f2.x = 4.0f;
+  f2.y = 4.0f;
+  f3.x = 3.0f;
+  f3.y = 3.0f;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+
+  return true;
+}
+
+bool TestFloat3() {  // float3: checks arithmetic, increment/decrement and comparison operators; returns true once all checks have run
+  float3 f1, f2, f3;
+  f1.x = 1.0f;
+  f1.y = 1.0f;
+  f1.z = 1.0f;
+  f2.x = 1.0f;
+  f2.y = 1.0f;
+  f2.z = 1.0f;
+  f3 = f1 + f2;
+  cmpFloat3(f3, 2.0f);
+  f2 = f3 - f1;
+  cmpFloat3(f2, 1.0f);
+  f1 = f2 * f3;
+  cmpFloat3(f1, 2.0f);
+  f2 = f1 / f3;
+  cmpFloat3(f2, 2.0f/2.0f);
+  f1 += f2;
+  cmpFloat3(f1, 3.0f);
+  f1 -= f2;
+  cmpFloat3(f1, 2.0f);
+  f1 *= f2;
+  cmpFloat3(f1, 2.0f);
+  f1 /= f2;
+  cmpFloat3(f1, 2.0f);  // the result of /= is verified too, as in the float1/float2 tests
+  f2 = f1++;  // post-increment: f2 receives the old value
+  cmpFloat3(f1, 3.0f);
+  cmpFloat3(f2, 2.0f);
+  f2 = f1--;  // post-decrement: f2 receives the old value
+  cmpFloat3(f2, 3.0f);
+  cmpFloat3(f1, 2.0f);
+  f2 = ++f1;  // pre-increment: f2 receives the new value
+  cmpFloat3(f1, 3.0f);
+  cmpFloat3(f2, 3.0f);
+  f2 = --f1;  // pre-decrement: f2 receives the new value
+  cmpFloat3(f1, 2.0f);
+  cmpFloat3(f2, 2.0f);  // check the value returned by --f1, not f1 a second time
+  // Comparison operators, with f1 == f3 and f1 < f2 in every component.
+  f1.x = 3.0f;
+  f1.y = 3.0f;
+  f1.z = 3.0f;
+  f2.x = 4.0f;
+  f2.y = 4.0f;
+  f2.z = 4.0f;
+  f3.x = 3.0f;
+  f3.y = 3.0f;
+  f3.z = 3.0f;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  return true;
+}
+
+
+bool TestFloat4() {  // float4: checks arithmetic, increment/decrement and comparison operators; returns true once all checks have run
+  float4 f1, f2, f3;
+  f1.x = 1.0f;
+  f1.y = 1.0f;
+  f1.z = 1.0f;
+  f1.w = 1.0f;
+  f2.x = 1.0f;
+  f2.y = 1.0f;
+  f2.z = 1.0f;
+  f2.w = 1.0f;
+  f3 = f1 + f2;
+  cmpFloat4(f3, 2.0f);
+  f2 = f3 - f1;
+  cmpFloat4(f2, 1.0f);
+  f1 = f2 * f3;
+  cmpFloat4(f1, 2.0f);
+  f2 = f1 / f3;
+  cmpFloat4(f2, 2.0f/2.0f);
+  f1 += f2;
+  cmpFloat4(f1, 3.0f);
+  f1 -= f2;
+  cmpFloat4(f1, 2.0f);
+  f1 *= f2;
+  cmpFloat4(f1, 2.0f);
+  f1 /= f2;
+  cmpFloat4(f1, 2.0f);  // the result of /= is verified too, as in the float1/float2 tests
+  f2 = f1++;  // post-increment: f2 receives the old value
+  cmpFloat4(f1, 3.0f);
+  cmpFloat4(f2, 2.0f);
+  f2 = f1--;  // post-decrement: f2 receives the old value
+  cmpFloat4(f2, 3.0f);
+  cmpFloat4(f1, 2.0f);
+  f2 = ++f1;  // pre-increment: f2 receives the new value
+  cmpFloat4(f1, 3.0f);
+  cmpFloat4(f2, 3.0f);
+  f2 = --f1;  // pre-decrement: f2 receives the new value
+  cmpFloat4(f1, 2.0f);
+  cmpFloat4(f2, 2.0f);  // check the value returned by --f1, not f1 a second time
+  // Comparison operators, with f1 == f3 and f1 < f2 in every component.
+  f1.x = 3.0f;
+  f1.y = 3.0f;
+  f1.z = 3.0f;
+  f1.w = 3.0f;
+  f2.x = 4.0f;
+  f2.y = 4.0f;
+  f2.z = 4.0f;
+  f2.w = 4.0f;
+  f3.x = 3.0f;
+  f3.y = 3.0f;
+  f3.z = 3.0f;
+  f3.w = 3.0f;
+  assert((f1 == f2) == false);
+  assert((f1 != f2) == true);
+  assert((f1 < f2) == true);
+  assert((f2 > f1) == true);
+  assert((f1 >= f3) == true);
+  assert((f1 <= f3) == true);
+
+  return true;
+}
+
+
+int main() {  // Checks the float vector type sizes, then runs every Test* function in sequence.
+  assert(sizeof(float1) == 4);
+  assert(sizeof(float2) == 8);
+  assert(sizeof(float3) == 12);
+  assert(sizeof(float4) == 16);
+  const bool passed = TestFloat1() && TestFloat2() && TestFloat3() && TestFloat4()
+    && TestUChar1() && TestUChar2() && TestUChar3() && TestUChar4()
+    && TestChar1() && TestChar2() && TestChar3() && TestChar4()
+    && TestUShort1() && TestUShort2() && TestUShort3() && TestUShort4()
+    && TestShort1() && TestShort2() && TestShort3() && TestShort4()
+    && TestUInt1() && TestUInt2() && TestUInt3() && TestUInt4()
+    && TestInt1() && TestInt2() && TestInt3() && TestInt4()
+    && TestULong1() && TestULong2() && TestULong3() && TestULong4()
+    && TestLong1() && TestLong2() && TestLong3() && TestLong4();
+  assert(passed); (void)passed;  // tests are called outside assert() so they still run when NDEBUG compiles assert() away
+  float1 f1 = make_float1(1.0f);  // NOTE(review): value is unused; presumably only confirms make_float1 is usable
+}

From 73fcce26f980367df57b28906f27e065365786ff Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 18:05:41 -0600
Subject: [PATCH 13/18] changed copyright year from 2016 to 2017 in src
 directory

Change-Id: Idb97db509b2b4b1656b2df7a14a62ade38c9d574
---
 src/device_functions.cpp |  4 +---
 src/device_util.cpp      |  2 +-
 src/device_util.h        |  2 +-
 src/hip_context.cpp      |  2 +-
 src/hip_device.cpp       |  3 +--
 src/hip_error.cpp        |  2 +-
 src/hip_event.cpp        |  6 ++----
 src/hip_fp16.cpp         |  2 +-
 src/hip_hcc.cpp          | 22 +++++++++++-----------
 src/hip_hcc.h            | 14 +++++++-------
 src/hip_ldg.cpp          |  2 +-
 src/hip_memory.cpp       |  4 ++--
 src/hip_module.cpp       |  4 +---
 src/hip_peer.cpp         | 26 +++++++++++++-------------
 src/hip_stream.cpp       |  4 ++--
 src/hip_util.h           |  2 +-
 src/trace_helper.h       | 20 ++++++++++----------
 17 files changed, 57 insertions(+), 64 deletions(-)

diff --git a/src/device_functions.cpp b/src/device_functions.cpp
index 30a09e6e02..0de0cf7f6b 100644
--- a/src/device_functions.cpp
+++ b/src/device_functions.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -54,5 +54,3 @@ __device__ double __hiloint2double (int hi, int lo) {
   s.s2.lo = lo;
   return s.d;
 }
-
-
diff --git a/src/device_util.cpp b/src/device_util.cpp
index 669fcb7570..db0f494af4 100644
--- a/src/device_util.cpp
+++ b/src/device_util.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/device_util.h b/src/device_util.h
index 8ccf5a540e..18811321d9 100644
--- a/src/device_util.h
+++ b/src/device_util.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/hip_context.cpp b/src/hip_context.cpp
index b38a6c3b74..6c862b114b 100644
--- a/src/hip_context.cpp
+++ b/src/hip_context.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/hip_device.cpp b/src/hip_device.cpp
index 0f2c2e2753..131526b6e2 100644
--- a/src/hip_device.cpp
+++ b/src/hip_device.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -479,4 +479,3 @@ hipError_t hipChooseDevice( int* device, const hipDeviceProp_t* prop )
     }
     return ihipLogStatus(e);
 }
-
diff --git a/src/hip_error.cpp b/src/hip_error.cpp
index 60c45cc1f7..4c14ba4156 100644
--- a/src/hip_error.cpp
+++ b/src/hip_error.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/hip_event.cpp b/src/hip_event.cpp
index 74fe487968..5a0ed9d8f8 100644
--- a/src/hip_event.cpp
+++ b/src/hip_event.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -43,7 +43,7 @@ hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
         eh->_stream = NULL;
         eh->_flags  = flags;
         eh->_timestamp  = 0;
-        *event = eh; 
+        *event = eh;
     } else {
         e = hipErrorInvalidValue;
     }
@@ -186,5 +186,3 @@ hipError_t hipEventQuery(hipEvent_t event)
         return ihipLogStatus(hipSuccess);
     }
 }
-
-
diff --git a/src/hip_fp16.cpp b/src/hip_fp16.cpp
index 1a9d04474f..0ecac0a6fb 100644
--- a/src/hip_fp16.cpp
+++ b/src/hip_fp16.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/hip_hcc.cpp b/src/hip_hcc.cpp
index e5b7937e25..a2383245fc 100644
--- a/src/hip_hcc.cpp
+++ b/src/hip_hcc.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -279,12 +279,12 @@ inline void ihipStream_t::ensureHaveQueue(LockedAccessor_StreamCrit_t &streamCri
         // TODO
         auto needyCritPtr = this->_criticalData.mlock();
 
-        // Second test to ensure we still need to steal the queue - another thread may have 
+        // Second test to ensure we still need to steal the queue - another thread may have
         // snuck in here and already solved the issue.
         if (!needyCritPtr->_hasQueue) {
             needyCritPtr->_av = this->_ctx->stealActiveQueue(ctxCrit, this);
         }
-        
+
         streamCrit->_hasQueue = true;
     }
     assert(streamCrit->_hasQueue);
@@ -394,7 +394,7 @@ LockedAccessor_StreamCrit_t ihipStream_t::lockopen_preKernelCommand()
     }
 
     this->ensureHaveQueue(crit);
-    
+
 
 
     return crit;
@@ -944,10 +944,10 @@ ihipCtx_t::stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *nee
                         uint64_t *p = (uint64_t*)(&victimCritPtr->_av);
                         *p = 0; // damage the victim av so attempt to use it will fault.
 
-                        (*iter)->_criticalData.munlock(); 
+                        (*iter)->_criticalData.munlock();
                         return av;
-                    }  
-                    (*iter)->_criticalData.munlock(); 
+                    }
+                    (*iter)->_criticalData.munlock();
                 }
             }
         }
@@ -1296,7 +1296,7 @@ void ihipInit()
         tokenize(HIP_LAUNCH_BLOCKING_KERNELS, ',', &g_hipLaunchBlockingKernels);
     }
     READ_ENV_I(release, HIP_API_BLOCKING, 0, "Make HIP APIs 'host-synchronous', so they block until completed.  Impacts hipMemcpyAsync, hipMemsetAsync." );
-    
+
 
     READ_ENV_I(release, HIP_MAX_QUEUES, 0, "Maximum number of queues that this app will use per-device.  Additional streams will share the specified number of queues.  0=no limit.");
 
@@ -1320,8 +1320,8 @@ void ihipInit()
 
 
     READ_ENV_I(release, HIP_WAIT_MODE, 0, "Force synchronization mode. 1= force yield, 2=force spin, 0=defaults specified in application");
-    READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer"); 
-    READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies"); 
+    READ_ENV_I(release, HIP_FORCE_P2P_HOST, 0, "Force use of host/staging copy for peer-to-peer copies.1=always use copies, 2=always return false for hipDeviceCanAccessPeer");
+    READ_ENV_I(release, HIP_FORCE_SYNC_COPY, 0, "Force all copies (even hipMemcpyAsync) to use sync copies");
 
     // TODO - review, can we remove this?
     READ_ENV_I(release, HIP_NUM_KERNELS_INFLIGHT, 128, "Max number of inflight kernels per stream before active synchronization is forced.");
@@ -2026,7 +2026,7 @@ hipError_t hipHccGetAcceleratorView(hipStream_t stream, hc::accelerator_view **a
         stream = device->_defaultStream;
     }
 
-    *av = stream->locked_getAv(); // TODO - review.  
+    *av = stream->locked_getAv(); // TODO - review.
 
     hipError_t err = hipSuccess;
     return ihipLogStatus(err);
diff --git a/src/hip_hcc.h b/src/hip_hcc.h
index e19ce63263..031c92fca7 100644
--- a/src/hip_hcc.h
+++ b/src/hip_hcc.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -415,15 +415,15 @@ public:
 
     ihipStreamCriticalBase_t<StreamMutex>  * mlock() { LockedBase<MUTEX_TYPE>::lock(); return this;};
 
-    void munlock() { 
+    void munlock() {
         tprintf(DB_SYNC, "munlocking criticalData=%p for %s...\n", this, ToString(this->_parent).c_str());
-        LockedBase<MUTEX_TYPE>::unlock(); 
+        LockedBase<MUTEX_TYPE>::unlock();
     };
 
-    ihipStreamCriticalBase_t<StreamMutex>  * mtry_lock() { 
+    ihipStreamCriticalBase_t<StreamMutex>  * mtry_lock() {
         bool gotLock = LockedBase<MUTEX_TYPE>::try_lock() ;
         tprintf(DB_SYNC, "mtry_locking=%d criticalData=%p for %s...\n", gotLock, this, ToString(this->_parent).c_str());
-        return gotLock ?  this: nullptr; 
+        return gotLock ?  this: nullptr;
     };
 
 public:
@@ -683,7 +683,7 @@ public: // Functions:
     ihipCtx_t(ihipDevice_t *device, unsigned deviceCnt, unsigned flags); // note: calls constructor for _criticalData
     ~ihipCtx_t();
 
-    // Functions which read or write the critical data are named locked_. 
+    // Functions which read or write the critical data are named locked_.
     // (might be better called "locking_"
     // ihipCtx_t does not use recursive locks so the ihip implementation must avoid calling a locked_ function from within a locked_ function.
     // External functions which call several locked_ functions will acquire and release the lock for each function.  if this occurs in
@@ -697,7 +697,7 @@ public: // Functions:
     hc::accelerator_view  stealActiveQueue(LockedAccessor_CtxCrit_t &ctxCrit, ihipStream_t *needyStream);
     hc::accelerator_view createOrStealQueue(LockedAccessor_CtxCrit_t &ctxCrit);
 
-    ihipCtxCritical_t  &criticalData() { return _criticalData; }; 
+    ihipCtxCritical_t  &criticalData() { return _criticalData; };
 
     const ihipDevice_t *getDevice() const { return _device; };
     int                 getDeviceNum() const { return _device->_deviceId; };
diff --git a/src/hip_ldg.cpp b/src/hip_ldg.cpp
index 075e1926f1..d91f54a807 100644
--- a/src/hip_ldg.cpp
+++ b/src/hip_ldg.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp
index 372d295b89..c43b6991c6 100644
--- a/src/hip_memory.cpp
+++ b/src/hip_memory.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -1087,7 +1087,7 @@ hipError_t hipIpcOpenMemHandle(void** devPtr, hipIpcMemHandle_t handle, unsigned
         hsa_amd_ipc_memory_attach(&handle->ipc_handle, handle->psize, 1, agent, devPtr);
     if(hsa_status != HSA_STATUS_SUCCESS)
         hipStatus = hipErrorMapBufferObjectFailed;
-#else 
+#else
     hipStatus = hipErrorRuntimeOther;
 #endif
     return hipStatus;
diff --git a/src/hip_module.cpp b/src/hip_module.cpp
index 74cdd8a4ae..7d77dbe8dd 100644
--- a/src/hip_module.cpp
+++ b/src/hip_module.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -403,5 +403,3 @@ hipError_t hipModuleLoadData(hipModule_t *module, const void *image)
     }
     return ihipLogStatus(ret);
 }
-
-
diff --git a/src/hip_peer.cpp b/src/hip_peer.cpp
index b7dca06e5f..e57665be0c 100644
--- a/src/hip_peer.cpp
+++ b/src/hip_peer.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,7 +31,7 @@ THE SOFTWARE.
 // There are two flavors:
 //   - one where contexts are specified with hipCtx_t type.
 //   - one where contexts are specified with integer deviceIds, that are mapped to the primary context for that device.
-// The implementation contains a set of internal ihip* functions which operate on contexts.  Then the 
+// The implementation contains a set of internal ihip* functions which operate on contexts.  Then the
 // public APIs are thin wrappers which call into this internal implementations.
 // TODO - actually not yet - currently the integer deviceId flavors just call the context APIs.  need to fix.
 
@@ -46,16 +46,16 @@ hipError_t ihipDeviceCanAccessPeer (int* canAccessPeer, hipCtx_t thisCtx, hipCtx
 
         if (thisCtx == peerCtx) {
             *canAccessPeer = 0;
-            tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n", 
-                    thisCtx->toString().c_str(), peerCtx->toString().c_str()); 
+            tprintf(DB_MEM, "Can't be peer to self. (this=%s, peer=%s)\n",
+                    thisCtx->toString().c_str(), peerCtx->toString().c_str());
         } else  if (HIP_FORCE_P2P_HOST & 0x2) {
             *canAccessPeer = false;
-            tprintf(DB_MEM, "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s  canAccessPeer=%d\n", 
-                    thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); 
+            tprintf(DB_MEM, "HIP_FORCE_P2P_HOST denies peer access this=%s peer=%s  canAccessPeer=%d\n",
+                    thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
         } else {
             *canAccessPeer = peerCtx->getDevice()->_acc.get_is_peer(thisCtx->getDevice()->_acc);
-            tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s  canAccessPeer=%d\n", 
-                    thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer); 
+            tprintf(DB_MEM, "deviceCanAccessPeer this=%s peer=%s  canAccessPeer=%d\n",
+                    thisCtx->toString().c_str(), peerCtx->toString().c_str(), *canAccessPeer);
         }
 
     } else {
@@ -99,14 +99,14 @@ hipError_t ihipDisablePeerAccess (hipCtx_t peerCtx)
             LockedAccessor_CtxCrit_t peerCrit(peerCtx->criticalData());
             bool changed = peerCrit->removePeerWatcher(peerCtx, thisCtx);
             if (changed) {
-                tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n", 
-                                  thisCtx->toString().c_str(), peerCtx->toString().c_str()); 
+                tprintf(DB_MEM, "device %s disable access to memory allocated on peer:%s\n",
+                                  thisCtx->toString().c_str(), peerCtx->toString().c_str());
                 // Update the peers for all memory already saved in the tracker:
                 am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents());
             } else {
                 err = hipErrorPeerAccessNotEnabled; // never enabled P2P access.
             }
-        } 
+        }
     } else {
         err = hipErrorInvalidDevice;
     }
@@ -133,8 +133,8 @@ hipError_t ihipEnablePeerAccess (hipCtx_t peerCtx, unsigned int flags)
             // Add thisCtx to peerCtx's access list so that new allocations on peer will be made visible to this device:
             bool isNewPeer = peerCrit->addPeerWatcher(peerCtx, thisCtx);
             if (isNewPeer) {
-                tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n", 
-                                  thisCtx->toString().c_str(), peerCtx->toString().c_str()); 
+                tprintf(DB_MEM, "device=%s can now see all memory allocated on peer=%s\n",
+                                  thisCtx->toString().c_str(), peerCtx->toString().c_str());
                 am_memtracker_update_peers(peerCtx->getDevice()->_acc, peerCrit->peerCnt(), peerCrit->peerAgents());
             } else {
                 err = hipErrorPeerAccessAlreadyEnabled;
diff --git a/src/hip_stream.cpp b/src/hip_stream.cpp
index aae412160f..594fb6e860 100644
--- a/src/hip_stream.cpp
+++ b/src/hip_stream.cpp
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -214,7 +214,7 @@ hipError_t hipStreamAddCallback(hipStream_t stream, hipStreamCallback_t callback
 {
     HIP_INIT_API(stream, callback, userData, flags);
     hipError_t e = hipSuccess;
-    //--- explicitly synchronize stream to add callback routines 
+    //--- explicitly synchronize stream to add callback routines
     hipStreamSynchronize(stream);
     callback(stream, e, userData);
     return ihipLogStatus(e);
diff --git a/src/hip_util.h b/src/hip_util.h
index 34a80ed205..f6817ffccb 100644
--- a/src/hip_util.h
+++ b/src/hip_util.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/src/trace_helper.h b/src/trace_helper.h
index bde40d0690..f58f81fbff 100644
--- a/src/trace_helper.h
+++ b/src/trace_helper.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -55,7 +55,7 @@ inline std::string ToHexString(T v)
 
 // This is the default which works for most types:
 template <typename T>
-inline std::string ToString(T v) 
+inline std::string ToString(T v)
 {
     std::ostringstream ss;
     ss << v;
@@ -65,7 +65,7 @@ inline std::string ToString(T v)
 
 //  hipEvent_t specialization. TODO - maybe add an event ID for debug?
 template <>
-inline std::string ToString(hipEvent_t v) 
+inline std::string ToString(hipEvent_t v)
 {
     std::ostringstream ss;
     ss << v;
@@ -74,7 +74,7 @@ inline std::string ToString(hipEvent_t v)
 
 //  hipEvent_t specialization. TODO - maybe add an event ID for debug?
 template <>
-inline std::string ToString(hipFunction_t v) 
+inline std::string ToString(hipFunction_t v)
 {
     std::ostringstream ss;
     ss << "0x" << std::hex << v._object;
@@ -85,7 +85,7 @@ inline std::string ToString(hipFunction_t v)
 
 //  hipStream_t
 template <>
-inline std::string ToString(hipStream_t v) 
+inline std::string ToString(hipStream_t v)
 {
     std::ostringstream ss;
     if (v == NULL) {
@@ -99,7 +99,7 @@ inline std::string ToString(hipStream_t v)
 
 //  hipMemcpyKind specialization
 template <>
-inline std::string ToString(hipMemcpyKind v) 
+inline std::string ToString(hipMemcpyKind v)
 {
     switch(v) {
     CASE_STR(hipMemcpyHostToHost);
@@ -113,14 +113,14 @@ inline std::string ToString(hipMemcpyKind v)
 
 
 template <>
-inline std::string ToString(hipError_t v) 
+inline std::string ToString(hipError_t v)
 {
     return ihipErrorString(v);
 };
 
 
 // Catch empty arguments case
-inline std::string ToString() 
+inline std::string ToString()
 {
     return ("");
 }
@@ -129,8 +129,8 @@ inline std::string ToString()
 //---
 // C++11 variadic template - peels off first argument, converts to string, and calls itself again to peel the next arg.
 // Strings are automatically separated by comma+space.
-template <typename T, typename... Args> 
-inline std::string ToString(T first, Args... args) 
+template <typename T, typename... Args>
+inline std::string ToString(T first, Args... args)
 {
     return ToString(first) + ", " + ToString(args...) ;
 }

From 98c4221dc2949882ac3bc6f729ab855c50662a55 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 18:09:33 -0600
Subject: [PATCH 14/18] changed copyright year from 2016 to 2017 in include
 directory

Change-Id: Ib5935a84fb51a04b3446df31cc2287101f791b83
---
 include/hip/device_functions.h | 6 +++---
 include/hip/hcc.h              | 2 +-
 include/hip/hip_common.h       | 6 +++---
 include/hip/hip_complex.h      | 3 +--
 include/hip/hip_fp16.h         | 2 +-
 include/hip/hip_profile.h      | 6 +++---
 include/hip/hip_runtime.h      | 9 ++++-----
 include/hip/hip_runtime_api.h  | 2 +-
 include/hip/hip_texture.h      | 6 +++---
 include/hip/hip_vector_types.h | 6 +++---
 10 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/include/hip/device_functions.h b/include/hip/device_functions.h
index 838bad8f0c..24211b7d2d 100644
--- a/include/hip/device_functions.h
+++ b/include/hip/device_functions.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
@@ -26,8 +26,8 @@ THE SOFTWARE.
 #include <hip/hcc_detail/device_functions.h>
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
 #include <device_functions.h>
-#else 
+#else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
-#endif 
+#endif
 
 #endif
diff --git a/include/hip/hcc.h b/include/hip/hcc.h
index dba26aeab3..9b8a649412 100644
--- a/include/hip/hcc.h
+++ b/include/hip/hcc.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/include/hip/hip_common.h b/include/hip/hip_common.h
index 4c75568f4d..f0e58f1f76 100644
--- a/include/hip/hip_common.h
+++ b/include/hip/hip_common.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -25,7 +25,7 @@ THE SOFTWARE.
 // Common code included at start of every hip file.
 // Auto enable __HIP_PLATFORM_HCC__ if compiling with HCC
 // Other compiler (GCC,ICC,etc) need to set one of these macros explicitly
-#if defined(__HCC__) 
+#if defined(__HCC__)
 #define __HIP_PLATFORM_HCC__
 #define __HIPCC__
 
@@ -37,7 +37,7 @@ THE SOFTWARE.
 #endif
 
 // Auto enable __HIP_PLATFORM_NVCC__ if compiling with NVCC
-#if defined(__NVCC__) 
+#if defined(__NVCC__)
 #define __HIP_PLATFORM_NVCC__
 # ifdef __CUDACC__
 # define __HIPCC__
diff --git a/include/hip/hip_complex.h b/include/hip/hip_complex.h
index 0f4fb0b3d8..ea15137894 100644
--- a/include/hip/hip_complex.h
+++ b/include/hip/hip_complex.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -31,4 +31,3 @@ THE SOFTWARE.
 #else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
 #endif
-
diff --git a/include/hip/hip_fp16.h b/include/hip/hip_fp16.h
index b91063998a..2f64c1a143 100644
--- a/include/hip/hip_fp16.h
+++ b/include/hip/hip_fp16.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/include/hip/hip_profile.h b/include/hip/hip_profile.h
index 489143adfd..e621ae8c79 100644
--- a/include/hip/hip_profile.h
+++ b/include/hip/hip_profile.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,7 @@ THE SOFTWARE.
 #define HIP_BEGIN_MARKER(markerName, group) amdtBeginMarker(markerName, group, nullptr);
 #define HIP_END_MARKER() amdtEndMarker();
 #else
-#define HIP_SCOPED_MARKER(markerName, group) 
+#define HIP_SCOPED_MARKER(markerName, group)
 #define HIP_BEGIN_MARKER(markerName, group)
-#define HIP_END_MARKER() 
+#define HIP_END_MARKER()
 #endif
diff --git a/include/hip/hip_runtime.h b/include/hip/hip_runtime.h
index dff1e19252..9bc45f300d 100644
--- a/include/hip/hip_runtime.h
+++ b/include/hip/hip_runtime.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -22,7 +22,7 @@ THE SOFTWARE.
 
 //! HIP = Heterogeneous-compute Interface for Portability
 //!
-//! Define a extremely thin runtime layer that allows source code to be compiled unmodified 
+//! Define a extremely thin runtime layer that allows source code to be compiled unmodified
 //! through either AMD HCC or NVCC.   Key features tend to be in the spirit
 //! and terminology of CUDA, but with a portable path to other accelerators as well:
 //
@@ -54,11 +54,10 @@ THE SOFTWARE.
 #include <hip/hcc_detail/hip_runtime.h>
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/hip_runtime.h>
-#else 
+#else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
-#endif 
+#endif
 
 
 #include <hip/hip_runtime_api.h>
 #include <hip/hip_vector_types.h>
-
diff --git a/include/hip/hip_runtime_api.h b/include/hip/hip_runtime_api.h
index a2bfed5c69..28d67fc01a 100644
--- a/include/hip/hip_runtime_api.h
+++ b/include/hip/hip_runtime_api.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
diff --git a/include/hip/hip_texture.h b/include/hip/hip_texture.h
index 3e7802b457..66ec4a6ca1 100644
--- a/include/hip/hip_texture.h
+++ b/include/hip/hip_texture.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -29,9 +29,9 @@ THE SOFTWARE.
 #include <hip/hcc_detail/hip_texture.h>
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
 #include <hip/nvcc_detail/hip_texture.h>
-#else 
+#else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
-#endif 
+#endif
 
 
 #endif
diff --git a/include/hip/hip_vector_types.h b/include/hip/hip_vector_types.h
index 7733d92bda..33827e4d96 100644
--- a/include/hip/hip_vector_types.h
+++ b/include/hip/hip_vector_types.h
@@ -1,5 +1,5 @@
 /*
-Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +33,6 @@ THE SOFTWARE.
 #endif
 #elif defined(__HIP_PLATFORM_NVCC__) && !defined (__HIP_PLATFORM_HCC__)
 #include <vector_types.h>
-#else 
+#else
 #error("Must define exactly one of __HIP_PLATFORM_HCC__ or __HIP_PLATFORM_NVCC__");
-#endif 
+#endif

From e2318cda74077d0424497a6a00ff590285d8c135 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Wed, 11 Jan 2017 18:23:37 -0600
Subject: [PATCH 15/18] changed data type used for complex

Change-Id: I0a3bb281af3d5ac1290207821c7c45aea40f513f
---
 include/hip/hcc_detail/hip_complex.h | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/hip/hcc_detail/hip_complex.h b/include/hip/hcc_detail/hip_complex.h
index 21995de096..f4af5839ad 100644
--- a/include/hip/hcc_detail/hip_complex.h
+++ b/include/hip/hcc_detail/hip_complex.h
@@ -23,10 +23,8 @@ THE SOFTWARE.
 #ifndef HIPCOMPLEX_H
 #define HIPCOMPLEX_H
 
-typedef struct{
-    float x;
-    float y;
-}hipFloatComplex;
+typedef float2 hipFloatComplex;
+typedef double2 hipDoubleComplex;
 
 __device__ static inline float hipCrealf(hipFloatComplex z){
     return z.x;
@@ -79,10 +77,6 @@ __device__ static inline float hipCabsf(hipFloatComplex z){
 }
 
 
-typedef struct{
-    double x;
-    double y;
-}hipDoubleComplex;
 
 __device__ static inline double hipCreal(hipDoubleComplex z){
     return z.x;

From d180fdaae07223e345a630f8f33f5dbb15b9144f Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Thu, 12 Jan 2017 11:30:20 -0600
Subject: [PATCH 16/18] Started adding native half math library support. 1.
 Removed the HIP_EXPERIMENTAL env variable so that device code is always
 accessed from LLVM IR. 2. Removed soft support from the headers and moved it
 to hip_fp16.cpp. 3. Added LLVM IR + inline asm to hip_ir.ll. 4. Added a test
 for fp16. 5. Added barriers for hcc 3.5 and hcc 4.0 for half support: a. hcc
 4.0 can parse __fp16 but hcc 3.5 can't; b. the hcc 4.0 code is implemented
 now, hcc 3.5 will be added later.

Change-Id: Ic37859b2688ebb02e168bab643d1882bf4727952
---
 bin/hipcc                           |   2 +-
 include/hip/hcc_detail/hip_fp16.h   | 214 ++++++---------------------
 src/hip_fp16.cpp                    | 220 ++++++++++++++--------------
 src/hip_ir.ll                       |  49 +++++++
 tests/src/deviceLib/hipTestHalf.cpp |  75 ++++++++++
 5 files changed, 278 insertions(+), 282 deletions(-)
 create mode 100644 tests/src/deviceLib/hipTestHalf.cpp

diff --git a/bin/hipcc b/bin/hipcc
index ccea650776..de8f0cb9a3 100755
--- a/bin/hipcc
+++ b/bin/hipcc
@@ -220,7 +220,7 @@ if($HIP_PLATFORM eq "hcc"){
   }
 }
 
-if(($HIP_PLATFORM eq "hcc") and defined $ENV{HIP_EXPERIMENTAL}){
+if(($HIP_PLATFORM eq "hcc")){
     $EXPORT_LL=" ";
     $ENV{HCC_EXTRA_LIBRARIES}="$HIP_PATH/lib/hip_ir.ll\n";
 }
diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h
index d51a5d1fcd..c779bcfba2 100644
--- a/include/hip/hcc_detail/hip_fp16.h
+++ b/include/hip/hcc_detail/hip_fp16.h
@@ -25,213 +25,81 @@ THE SOFTWARE.
 
 #include "hip/hip_runtime.h"
 
-#if 0
+#if __clang_major__ == 4
 
 typedef __fp16 __half;
 
 typedef struct __attribute__((aligned(4))){
-  int a;
+  union {
+    __half p[2];
+    unsigned int q;
+  };
 } __half2;
 
-extern "C" __half __hip_hadd_gfx803(__half a, __half b);
-extern "C" __half __hip_hfma_gfx803(__half a, __half b);
-extern "C" __half __hip_hmul_gfx803(__half a, __half b);
-extern "C" __half __hip_hsub_gfx803(__half a, __half b);
+extern "C" __half __hip_hc_ir_hadd_half(__half, __half);
+extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
+extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
+extern "C" __half __hip_hc_ir_hsub_half(__half, __half);
 
-extern "C" int __hip_hadd2_gfx803(int a, int b);
-extern "C" int __hip_hfma2_gfx803(int a, int b);
-extern "C" int __hip_hmul2_gfx803(int a, int b);
-extern "C" int __hip_hsub2_gfx803(int a, int b);
-
-__device__ inline __half __hadd(__half a, __half b) {
-  return __hip_hadd_gfx803(a, b);
+__device__ static inline __half __hadd(const __half a, const __half b) {
+  return __hip_hc_ir_hadd_half(a, b);
 }
 
-__device__ inline __half __hadd_sat(__half a, __half b) {
-  return __hip_hadd_gfx803(a, b);
+__device__ static inline __half __hadd_sat(__half a, __half b) {
+  return __hip_hc_ir_hadd_half(a, b);
 }
 
-__device__ inline __half __hfma(__half a, __half b) {
-  return __hip_hfma_gfx803(a, b);
+__device__ static inline __half __hfma(__half a, __half b, __half c) {
+  return __hip_hc_ir_hfma_half(a, b, c);
 }
 
-__device__ inline __half __hfma_sat(__half a, __half b) {
-  return __hip_hfma_gfx803(a, b);
+__device__ static inline __half __hfma_sat(__half a, __half b, __half c) {
+  return __hip_hc_ir_hfma_half(a, b, c);
 }
 
-__device__ inline __half __hmul(__half a, __half b) {
-  return __hip_hmul_gfx803(a, b);
+__device__ static inline __half __hmul(__half a, __half b) {
+  return __hip_hc_ir_hmul_half(a, b);
 }
 
-__device__ inline __half __hmul_sat(__half a, __half b) {
-  return __hip_hmul_gfx803(a, b);
+__device__ static inline __half __hmul_sat(__half a, __half b) {
+  return __hip_hc_ir_hmul_half(a, b);
 }
 
-__device__ inline __half __hsub(__half a, __half b) {
-  return __hip_hsub_gfx803(a, b);
+__device__ static inline __half __hneg(__half a) {
+  return -a;
 }
 
-__device__ inline __half __hsub_sat(__half a, __half b) {
-  return __hip_hsub_gfx803(a, b);
+__device__ static inline __half __hsub(__half a, __half b) {
+  return __hip_hc_ir_hsub_half(a, b);
 }
 
-
-__device__ inline __half2 __hadd2(__half2 a, __half2 b) {
-  __half2 ret;
-  ret.a = __hip_hadd2_gfx803(a.a, b.a);
-  return ret;
+__device__ static inline __half __hsub_sat(__half a, __half b) {
+  return __hip_hc_ir_hsub_half(a, b);
 }
 
-#else
+__device__ static inline __half hdiv(__half a, __half b) {
+  return a/b;
+}
 
-typedef struct{
+#endif
+
+#if __clang_major__ == 3
+
+typedef struct {
   unsigned x: 16;
 } __half;
 
-
 typedef struct __attribute__((aligned(4))){
-  __half p,q;
+  union {
+    __half p[2];
+    unsigned int q;
+  };
 } __half2;
 
-typedef __half half;
-typedef __half2 half2;
 
-/*
-Arithmetic functions
-*/
 
-__device__ __half __hadd(const __half a, const __half b);
-
-__device__ __half __hadd_sat(const __half a, const __half b);
-
-__device__ __half __hfma(const __half a, const __half b, const __half c);
-
-__device__ __half __hfma_sat(const __half a, const __half b, const __half c);
-
-__device__ __half __hmul(const __half a, const __half b);
-
-__device__ __half __hmul_sat(const __half a, const __half b);
-
-__device__ __half __hneq(const __half a);
-
-__device__ __half __hsub(const __half a, const __half b);
-
-__device__ __half __hsub_sat(const __half a, const __half b);
-
-
-
-/*
-Half2 Arithmetic Instructions
-*/
-
-__device__ __half2 __hadd2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hadd2_sat(const __half2 a, const __half2 b);
-
-__device__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c);
-
-__device__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c);
-
-__device__ __half2 __hmul2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hmul2_sat(const __half2 a, const __half2 b);
-
-__device__ __half2 __hneq2(const __half2 a);
-
-__device__ __half2 __hsub2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hsub2_sat(const __half2 a, const __half2 b);
-
-/*
-Half Cmps
-*/
-
-__device__  bool __heq(const __half a, const __half b);
-
-__device__ bool __hge(const __half a, const __half b);
-
-__device__ bool __hgt(const __half a, const __half b);
-
-__device__ bool __hisinf(const __half a);
-
-__device__ bool __hisnan(const __half a);
-
-__device__ bool __hle(const __half a, const __half b);
-
-__device__ bool __hlt(const __half a, const __half b);
-
-__device__ bool __hne(const __half a, const __half b);
-
-/*
-Half2 Cmps
-*/
-
-__device__ bool __hbeq2(const __half2 a, const __half2 b);
-
-__device__ bool __hbge2(const __half2 a, const __half2 b);
-
-__device__ bool __hbgt2(const __half2 a, const __half2 b);
-
-__device__ bool __hble2(const __half2 a, const __half2 b);
-
-__device__ bool __hblt2(const __half2 a, const __half2 b);
-
-__device__ bool __hbne2(const __half2 a, const __half2 b);
-
-__device__ __half2 __heq2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hge2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hgt2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hisnan2(const __half2 a);
-
-__device__ __half2 __hle2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hlt2(const __half2 a, const __half2 b);
-
-__device__ __half2 __hne2(const __half2 a, const __half2 b);
-
-
-/*
-Half Cnvs and Data Mvmnt
-*/
-
-__device__ __half2 __float22half2_rn(const float2 a);
-
-__device__ __half __float2half(const float a);
-
-__device__ __half2 __float2half2_rn(const float a);
-
-__device__ __half2 __floats2half2_rn(const float a, const float b);
-
-__device__ float2 __half22float2(const __half2 a);
-
-__device__ float __half2float(const __half a);
-
-__device__ __half2 __half2half2(const __half a);
-
-__device__ __half2 __halves2half2(const __half a, const __half b);
-
-__device__ float __high2float(const __half2 a);
-
-__device__ __half __high2half(const __half2 a);
-
-__device__ __half2 __high2half2(const __half2 a);
-
-__device__ __half2 __highs2half2(const __half2 a, const __half2 b);
-
-__device__ float __low2float(const __half2 a);
-
-__device__ __half __low2half(const __half2 a);
-
-__device__ __half2 __low2half2(const __half2 a);
-
-__device__ __half2 __lows2half2(const __half2 a, const __half2 b);
-
-__device__ __half2 __lowhigh2highlow(const __half2 a);
-
-__device__ __half2 __low2half2(const __half2 a, const __half2 b);
 
 #endif
+
+
 #endif
diff --git a/src/hip_fp16.cpp b/src/hip_fp16.cpp
index 0ecac0a6fb..83e0a161c7 100644
--- a/src/hip_fp16.cpp
+++ b/src/hip_fp16.cpp
@@ -35,6 +35,8 @@ typedef struct{
   };
 } struct_float;
 
+#if __clang_major__ == 3
+
 static __device__ float cvt_half_to_float(__half a){
   struct_float ret = {0};
   if(a.x == 0){
@@ -64,44 +66,44 @@ static __device__ __half cvt_float_to_half(float b){
 }
 
 
-__device__ __half __hadd(const __half a, const __half b){
+__device__ __half __soft_hadd(const __half a, const __half b){
   return cvt_float_to_half(cvt_half_to_float(a)+cvt_half_to_float(b));
 }
 
-__device__ __half __hadd_sat(const __half a, const __half b){
+__device__ __half __soft_hadd_sat(const __half a, const __half b){
   float f = cvt_half_to_float(a) + cvt_half_to_float(b);
   return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
 }
 
-__device__ __half __hfma(const __half a, const __half b, const __half c){
+__device__ __half __soft_hfma(const __half a, const __half b, const __half c){
   return cvt_float_to_half(fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c)));
 }
 
-__device__ __half __hfma_sat(const __half a, const __half b, const __half c){
+__device__ __half __soft_hfma_sat(const __half a, const __half b, const __half c){
   float f = fmaf(cvt_half_to_float(a), cvt_half_to_float(b), cvt_half_to_float(c));
   return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
 }
 
-__device__ __half __hmul(const __half a, const __half b){
+__device__ __half __soft_hmul(const __half a, const __half b){
   return cvt_float_to_half(cvt_half_to_float(a)*cvt_half_to_float(b));
 }
 
-__device__ __half __hmul_sat(const __half a, const __half b){
+__device__ __half __soft_hmul_sat(const __half a, const __half b){
   float f = cvt_half_to_float(a) * cvt_half_to_float(b);
   return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
 }
 
-__device__ __half __hneq(const __half a){
+__device__ __half __soft_hneq(const __half a){
   __half ret = {a.x};
   ret.x ^= 1 << 15;
   return ret;
 }
 
-__device__ __half __hsub(const __half a, const __half b){
+__device__ __half __soft_hsub(const __half a, const __half b){
   return cvt_float_to_half(cvt_half_to_float(a)-cvt_half_to_float(b));
 }
 
-__device__ __half __hsub_sat(const __half a, const __half b){
+__device__ __half __soft_hsub_sat(const __half a, const __half b){
   float f = cvt_half_to_float(a) - cvt_half_to_float(b);
   return (f < 0.0f ? __half_value_zero_float : (f > 1.0f ? __half_value_one_float: cvt_float_to_half(f)));
 }
@@ -111,66 +113,66 @@ __device__ __half __hsub_sat(const __half a, const __half b){
 Half2 Arithmetic Instructions
 */
 
-__device__ __half2 __hadd2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hadd2(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hadd(a.p, b.p);
-  ret.q = __hadd(a.q, b.q);
+  ret.p[1] = __soft_hadd(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hadd(a.p[0], b.p[0]);
   return ret;
 }
 
-__device__ __half2 __hadd2_sat(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hadd2_sat(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hadd_sat(a.p, b.p);
-  ret.q = __hadd_sat(a.q, b.q);
+  ret.p[1] = __soft_hadd_sat(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hadd_sat(a.p[0], b.p[0]);
   return ret;
 }
 
-__device__ __half2 __hfma2(const __half2 a, const __half2 b, const __half2 c){
+__device__ __half2 __soft_hfma2(const __half2 a, const __half2 b, const __half2 c){
   __half2 ret;
-  ret.p = __hfma(a.p, b.p, c.p);
-  ret.q = __hfma(a.q, b.q, c.q);
+  ret.p[1] = __soft_hfma(a.p[1], b.p[1], c.p[1]);
+  ret.p[0] = __soft_hfma(a.p[0], b.p[0], c.p[0]);
   return ret;
 }
 
-__device__ __half2 __hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
+__device__ __half2 __soft_hfma2_sat(const __half2 a, const __half2 b, const __half2 c){
   __half2 ret;
-  ret.p = __hfma_sat(a.p, b.p, c.p);
-  ret.q = __hfma_sat(a.q, b.q, c.q);
+  ret.p[1] = __soft_hfma_sat(a.p[1], b.p[1], c.p[1]);
+  ret.p[0] = __soft_hfma_sat(a.p[0], b.p[0], c.p[0]);
   return ret;
 }
 
-__device__ __half2 __hmul2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hmul2(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hmul(a.p, b.p);
-  ret.q = __hmul(a.q, b.q);
+  ret.p[1] = __soft_hmul(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hmul(a.p[0], b.p[0]);
   return ret;
 }
 
-__device__ __half2 __hmul2_sat(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hmul2_sat(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hmul_sat(a.p, b.p);
-  ret.q = __hmul_sat(a.q, b.q);
+  ret.p[1] = __soft_hmul_sat(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hmul_sat(a.p[0], b.p[0]);
   return ret;
 }
 
-__device__ __half2 __hneq2(const __half2 a){
+__device__ __half2 __soft_hneq2(const __half2 a){
   __half2 ret;
-  ret.p = __hneq(a.p);
-  ret.q = __hneq(a.q);
+  ret.p[1] = __soft_hneq(a.p[1]);
+  ret.p[0] = __soft_hneq(a.p[0]);
   return ret;
 }
 
-__device__ __half2 __hsub2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hsub2(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hsub(a.p, b.p);
-  ret.q = __hsub(a.q, b.q);
+  ret.p[1] = __soft_hsub(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hsub(a.p[0], b.p[0]);
   return ret;
 }
 
-__device__ __half2 __hsub2_sat(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hsub2_sat(const __half2 a, const __half2 b){
   __half2 ret;
-  ret.p = __hsub_sat(a.p, b.p);
-  ret.q = __hsub_sat(a.q, b.q);
+  ret.p[1] = __soft_hsub_sat(a.p[1], b.p[1]);
+  ret.p[0] = __soft_hsub_sat(a.p[0], b.p[0]);
   return ret;
 }
 
@@ -178,23 +180,23 @@ __device__ __half2 __hsub2_sat(const __half2 a, const __half2 b){
 Half Cmps
 */
 
-__device__  bool __heq(const __half a, const __half b){
+__device__  bool __soft_heq(const __half a, const __half b){
   return (a.x == b.x ? true:false);
 }
 
-__device__ bool __hge(const __half a, const __half b){
+__device__ bool __soft_hge(const __half a, const __half b){
   return (cvt_half_to_float(a) >= cvt_half_to_float(b));
 }
 
-__device__ bool __hgt(const __half a, const __half b){
+__device__ bool __soft_hgt(const __half a, const __half b){
   return (cvt_half_to_float(a) > cvt_half_to_float(b));
 }
 
-__device__ bool __hisinf(const __half a){
+__device__ bool __soft_hisinf(const __half a){
   return ((a.x == __half_neg_inf) ? -1 : (a.x == __half_pos_inf) ? 1 : 0);
 }
 
-__device__ bool __hisnan(const __half a){
+__device__ bool __soft_hisnan(const __half a){
   if(((a.x & __half_pos_inf) == a.x) || ((a.x & __half_neg_inf) == a.x)){
     return true;
   }else{
@@ -202,15 +204,15 @@ __device__ bool __hisnan(const __half a){
   }
 }
 
-__device__ bool __hle(const __half a, const __half b){
+__device__ bool __soft_hle(const __half a, const __half b){
   return (cvt_half_to_float(a) <= cvt_half_to_float(b));
 }
 
-__device__ bool __hlt(const __half a, const __half b){
+__device__ bool __soft_hlt(const __half a, const __half b){
   return (cvt_half_to_float(a) < cvt_half_to_float(b));
 }
 
-__device__ bool __hne(const __half a, const __half b){
+__device__ bool __soft_hne(const __half a, const __half b){
   return a.x == b.x ? false : true;
 }
 
@@ -218,78 +220,78 @@ __device__ bool __hne(const __half a, const __half b){
 Half2 Cmps
 */
 
-__device__ bool __hbeq2(const __half2 a, const __half2 b){
-  return __heq(a.p, b.p) && __heq(a.q, b.q);
+__device__ bool __soft_hbeq2(const __half2 a, const __half2 b){
+  return __soft_heq(a.p[1], b.p[1]) && __soft_heq(a.p[0], b.p[0]);
 }
 
-__device__ bool __hbge2(const __half2 a, const __half2 b){
-  return __hge(a.p, b.p) && __hge(a.q, b.q);
+__device__ bool __soft_hbge2(const __half2 a, const __half2 b){
+  return __soft_hge(a.p[1], b.p[1]) && __soft_hge(a.p[0], b.p[0]);
 }
 
-__device__ bool __hbgt2(const __half2 a, const __half2 b){
-  return __hgt(a.p, b.p) && __hgt(a.q, b.q);
+__device__ bool __soft_hbgt2(const __half2 a, const __half2 b){
+  return __soft_hgt(a.p[1], b.p[1]) && __soft_hgt(a.p[0], b.p[0]);
 }
 
-__device__ bool __hble2(const __half2 a, const __half2 b){
-  return __hle(a.p, b.p) && __hle(a.q, b.q);
+__device__ bool __soft_hble2(const __half2 a, const __half2 b){
+  return __soft_hle(a.p[1], b.p[1]) && __soft_hle(a.p[0], b.p[0]);
 }
 
-__device__ bool __hblt2(const __half2 a, const __half2 b){
-  return __hlt(a.p, b.p) && __hlt(a.q, b.q);
+__device__ bool __soft_hblt2(const __half2 a, const __half2 b){
+  return __soft_hlt(a.p[1], b.p[1]) && __soft_hlt(a.p[0], b.p[0]);
 }
 
-__device__ bool __hbne2(const __half2 a, const __half2 b){
-  return __hne(a.p, b.p) && __hne(a.q, b.q);
+__device__ bool __soft_hbne2(const __half2 a, const __half2 b){
+  return __soft_hne(a.p[1], b.p[1]) && __soft_hne(a.p[0], b.p[0]);
 }
 
 
 
-__device__ __half2 __heq2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_heq2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__heq(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__heq(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_heq(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_heq(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hge2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hge2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__hge(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__hge(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_hge(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_hge(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hgt2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hgt2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__hgt(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__hgt(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_hgt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_hgt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hisnan2(const __half2 a){
+__device__ __half2 __soft_hisnan2(const __half2 a){
   __half2 ret = {0};
-  ret.p = __hisnan(a.p) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = __hisnan(a.q) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = __soft_hisnan(a.p[1]) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = __soft_hisnan(a.p[0]) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hle2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hle2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__hle(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__hle(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_hle(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_hle(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hlt2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hlt2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__hlt(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__hlt(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_hlt(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_hlt(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
-__device__ __half2 __hne2(const __half2 a, const __half2 b){
+__device__ __half2 __soft_hne2(const __half2 a, const __half2 b){
   __half2 ret = {0};
-  ret.p = (__hne(a.p, b.p)) ? __half_value_one_float : __half_value_zero_float;
-  ret.q = (__hne(a.q, b.q)) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[1] = (__soft_hne(a.p[1], b.p[1])) ? __half_value_one_float : __half_value_zero_float;
+  ret.p[0] = (__soft_hne(a.p[0], b.p[0])) ? __half_value_one_float : __half_value_zero_float;
   return ret;
 }
 
@@ -297,78 +299,80 @@ __device__ __half2 __hne2(const __half2 a, const __half2 b){
 Half Cnvs and Data Mvmnt
 */
 
-__device__ __half2 __float22half2_rn(const float2 a){
+__device__ __half2 __soft_float22half2_rn(const float2 a){
   __half2 ret = {0};
-  ret.p = cvt_float_to_half(a.x);
-  ret.q = cvt_float_to_half(a.y);
+  ret.p[1] = cvt_float_to_half(a.x);
+  ret.p[0] = cvt_float_to_half(a.y);
   return ret;
 }
 
-__device__ __half __float2half(const float a){
+__device__ __half __soft_float2half(const float a){
   return cvt_float_to_half(a);
 }
 
-__device__ __half2 __float2half2_rn(const float a){
+__device__ __half2 __soft_float2half2_rn(const float a){
   __half ret = cvt_float_to_half(a);
   return {ret, ret};
 }
 
-__device__ __half2 __floats2half2_rn(const float a, const float b){
+__device__ __half2 __soft_floats2half2_rn(const float a, const float b){
   return {cvt_float_to_half(a), cvt_float_to_half(b)};
 }
 
-__device__ float2 __half22float2(const __half2 a){
-  return {cvt_half_to_float(a.p), cvt_half_to_float(a.q)};
+__device__ float2 __soft_half22float2(const __half2 a){
+  return {cvt_half_to_float(a.p[1]), cvt_half_to_float(a.p[0])};
 }
 
-__device__ float __half2float(const __half a){
+__device__ float __soft_half2float(const __half a){
   return cvt_half_to_float(a);
 }
 
-__device__ __half2 __half2half2(const __half a){
+__device__ __half2 __soft_half2half2(const __half a){
   return {a,a};
 }
 
-__device__ __half2 __halves2half2(const __half a, const __half b){
+__device__ __half2 __soft_halves2half2(const __half a, const __half b){
   return {a,b};
 }
 
-__device__ float __high2float(const __half2 a){
-  return cvt_half_to_float(a.p);
+__device__ float __soft_high2float(const __half2 a){
+  return cvt_half_to_float(a.p[1]);
 }
 
-__device__ __half __high2half(const __half2 a){
-  return a.p;
+__device__ __half __soft_high2half(const __half2 a){
+  return a.p[1];
 }
 
-__device__ __half2 __high2half2(const __half2 a){
-  return {a.p, a.p};
+__device__ __half2 __soft_high2half2(const __half2 a){
+  return {a.p[1], a.p[1]};
 }
 
-__device__ __half2 __highs2half2(const __half2 a, const __half2 b){
-  return {a.p, b.p};
+__device__ __half2 __soft_highs2half2(const __half2 a, const __half2 b){
+  return {a.p[1], b.p[1]};
 }
 
-__device__ float __low2float(const __half2 a){
-  return cvt_half_to_float(a.q);
+__device__ float __soft_low2float(const __half2 a){
+  return cvt_half_to_float(a.p[0]);
 }
 
-__device__ __half __low2half(const __half2 a){
-  return a.q;
+__device__ __half __soft_low2half(const __half2 a){
+  return a.p[0];
 }
 
-__device__ __half2 __low2half2(const __half2 a){
-  return {a.q, a.q};
+__device__ __half2 __soft_low2half2(const __half2 a){
+  return {a.p[0], a.p[0]};
 }
 
-__device__ __half2 __lows2half2(const __half2 a, const __half2 b){
-  return {a.q, b.q};
+__device__ __half2 __soft_lows2half2(const __half2 a, const __half2 b){
+  return {a.p[0], b.p[0]};
 }
 
-__device__ __half2 __lowhigh2highlow(const __half2 a){
-  return {a.q, a.p};
+__device__ __half2 __soft_lowhigh2highlow(const __half2 a){
+  return {a.p[0], a.p[1]};
 }
 
-__device__ __half2 __low2half2(const __half2 a, const __half2 b){
-  return {a.q, b.q};
+__device__ __half2 __soft_low2half2(const __half2 a, const __half2 b){
+  return {a.p[0], b.p[0]};
 }
+
+#endif
diff --git a/src/hip_ir.ll b/src/hip_ir.ll
index 472038df6a..202bf9f215 100644
--- a/src/hip_ir.ll
+++ b/src/hip_ir.ll
@@ -12,6 +12,55 @@ define linkonce_odr spir_func void @__threadfence_block()  #1 {
     ret void
 }
 
+; Lightning does not support inline asm for 16-bit data types
+; So, bitcast half to short and then extend to 32bit i32
+; After inline asm, convert back to half
+define half @__hip_hc_ir_hadd_half(half %a, half %b) #1 {
+  %1 = bitcast half %a to i16
+  %2 = bitcast half %b to i16
+  %3 = zext i16 %1 to i32
+  %4 = zext i16 %2 to i32
+  %5 = tail call i32 asm "v_add_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
+  %6 = trunc i32 %5 to i16
+  %7 = bitcast i16 %6 to half
+  ret half %7
+}
+
+define half @__hip_hc_ir_hsub_half(half %a, half %b) #1 {
+  %1 = bitcast half %a to i16
+  %2 = bitcast half %b to i16
+  %3 = zext i16 %1 to i32
+  %4 = zext i16 %2 to i32
+  %5 = tail call i32 asm "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
+  %6 = trunc i32 %5 to i16
+  %7 = bitcast i16 %6 to half
+  ret half %7
+}
+
+define half @__hip_hc_ir_hmul_half(half %a, half %b) #1 {
+  %1 = bitcast half %a to i16
+  %2 = bitcast half %b to i16
+  %3 = zext i16 %1 to i32
+  %4 = zext i16 %2 to i32
+  %5 = tail call i32 asm "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %3, i32 %4)
+  %6 = trunc i32 %5 to i16
+  %7 = bitcast i16 %6 to half
+  ret half %7
+}
+
+define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 {
+  %1 = bitcast half %a to i16
+  %2 = bitcast half %b to i16
+  %3 = bitcast half %c to i16
+  %4 = zext i16 %1 to i32
+  %5 = zext i16 %2 to i32
+  %6 = zext i16 %3 to i32
+  %7 = tail call i32 asm "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %4, i32 %5, i32 %6)
+  %8 = trunc i32 %7 to i16
+  %9 = bitcast i16 %8 to half
+  ret half %9
+}
+
 
 
 attributes #1 = { alwaysinline nounwind }
diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
new file mode 100644
index 0000000000..9533bf34ca
--- /dev/null
+++ b/tests/src/deviceLib/hipTestHalf.cpp
@@ -0,0 +1,75 @@
+/*
+Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+/* HIT_START
+ * BUILD: %t %s ../test_common.cpp
+ * RUN: %t
+ * HIT_END
+ */
+
+#include "test_common.h"
+#include "hip/hip_runtime.h"
+#include "hip/hip_runtime_api.h"
+#include "hip/hip_fp16.h"
+
+#define hInf     0x7C00  // IEEE 754 binary16 bit patterns; each *PK macro repeats the pattern in both 16-bit halves
+#define hInfPK   0x7C007C00
+#define h65504   0x7BFF  // largest finite half: exponent 0x1E, mantissa 0x3FF (0xF7FF would decode to -32752)
+#define h65504PK 0x7BFF7BFF
+#define h27      0x4EC0
+#define h27PK    0x4EC04EC0
+#define h7       0x4700
+#define h7PK     0x47004700
+#define h3       0x4200
+#define h3PK     0x42004200
+#define h1       0x3C00
+#define h1PK     0x3C003C00
+#define hPoint5     0x3800
+#define hPoint5PK   0x38003800
+#define hZero    0x0000
+#define hNeg1    0xBC00
+#define hNeg1PK 0xBC00BC00
+
+struct holder{
+union{
+  __half a;
+  unsigned short b;
+};
+};
+
+__global__ void CheckHalf(hipLaunchParm lp, __half* In1, __half* In2, __half* In3, __half* Out){
+  Out[0] = __hadd(In1[0], In2[0]);
+  Out[1] = __hadd_sat(In1[1], In2[1]);
+  Out[2] = __hfma(In1[2], In2[2],In3[2]);
+  Out[3] = __hfma_sat(In1[3], In2[3], In3[3]);
+  Out[4] = __hmul(In1[4], In2[4]);
+  Out[5] = __hmul_sat(In1[5], In2[5]);
+  Out[6] = __hneg(In1[6]);
+  Out[7] = __hsub(In1[7], In2[7]);
+  Out[8] = __hsub_sat(In1[8], In2[8]);
+  Out[9] = hdiv(In1[9], In2[9]);
+}
+
+
+int main(){
+
+}

From 5ef8ef3bd7b95cba058dbce318e8b98c677bbf29 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Thu, 12 Jan 2017 14:10:51 -0600
Subject: [PATCH 17/18] added packed math fp16 native device functions 1. Added
 SDWA implementation inside IR file 2. Added device functions to header + used
 them in test

Change-Id: Ib4e059a58eee201cc82438689e3e9bc5f9d26653
---
 include/hip/hcc_detail/hip_fp16.h   | 71 +++++++++++++++++++++++++++++
 src/hip_ir.ll                       | 23 ++++++++++
 tests/src/deviceLib/hipTestHalf.cpp | 12 +++++
 3 files changed, 106 insertions(+)

diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h
index c779bcfba2..2ef8d330e7 100644
--- a/include/hip/hcc_detail/hip_fp16.h
+++ b/include/hip/hcc_detail/hip_fp16.h
@@ -41,6 +41,11 @@ extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
 extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
 extern "C" __half __hip_hc_ir_hsub_half(__half, __half);
 
+extern "C" int __hip_hc_ir_hadd2_int(int, int);  // packed-half helpers defined in src/hip_ir.ll; each int carries the `q` view of a __half2
+extern "C" int __hip_hc_ir_hfma2_int(int, int, int);
+extern "C" int __hip_hc_ir_hmul2_int(int, int);
+extern "C" int __hip_hc_ir_hsub2_int(int, int);
+
 __device__ static inline __half __hadd(const __half a, const __half b) {
   return __hip_hc_ir_hadd_half(a, b);
 }
@@ -81,6 +86,72 @@ __device__ static inline __half hdiv(__half a, __half b) {
   return a/b;
 }
 
+/*
+  Half2 Arithmetic Functions
+*/
+
+__device__ static inline __half2 __hadd2(__half2 a, __half2 b) {
+  __half2 sum;  // result pair, filled through its 32-bit `q` view
+  sum.q = __hip_hc_ir_hadd2_int(a.q, b.q);
+  return sum;
+}
+
+__device__ static inline __half2 __hadd2_sat(__half2 a, __half2 b) {
+  __half2 c;
+  c.q = __hip_hc_ir_hadd2_int(a.q, b.q);  // NOTE(review): same call as __hadd2, no clamping is applied here — confirm whether saturation is still TODO
+  return c;
+}
+
+__device__ static inline __half2 __hfma2(__half2 a, __half2 b, __half2 c) {
+  __half2 fused;  // result pair, filled through its 32-bit `q` view
+  fused.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);
+  return fused;
+}
+
+__device__ static inline __half2 __hfma2_sat(__half2 a, __half2 b, __half2 c) {
+  __half2 d;
+  d.q = __hip_hc_ir_hfma2_int(a.q, b.q, c.q);  // NOTE(review): same call as __hfma2, no clamping is applied here — confirm whether saturation is still TODO
+  return d;
+}
+
+__device__ static inline __half2 __hmul2(__half2 a, __half2 b) {
+  __half2 product;  // result pair, filled through its 32-bit `q` view
+  product.q = __hip_hc_ir_hmul2_int(a.q, b.q);
+  return product;
+}
+
+__device__ static inline __half2 __hmul2_sat(__half2 a, __half2 b) {
+  __half2 c;
+  c.q = __hip_hc_ir_hmul2_int(a.q, b.q);  // NOTE(review): same call as __hmul2, no clamping is applied here — confirm whether saturation is still TODO
+  return c;
+}
+
+__device__ static inline __half2 __hsub2(__half2 a, __half2 b) {
+  __half2 diff;  // result pair, filled through its 32-bit `q` view
+  diff.q = __hip_hc_ir_hsub2_int(a.q, b.q);
+  return diff;
+}
+
+__device__ static inline __half2 __hneg2(__half2 a) {
+  __half2 negated;  // each element is negated independently
+  negated.p[0] = -a.p[0];
+  negated.p[1] = -a.p[1];
+  return negated;
+}
+
+__device__ static inline __half2 __hsub2_sat(__half2 a, __half2 b) {
+  __half2 c;
+  c.q = __hip_hc_ir_hsub2_int(a.q, b.q);  // NOTE(review): same call as __hsub2, no clamping is applied here — confirm whether saturation is still TODO
+  return c;
+}
+
+__device__ static inline __half2 h2div(__half2 a, __half2 b) {  // element-wise a / b on the two halves
+  __half2 c;
+  c.p[0] = a.p[0] / b.p[0];  // plain `/` on __half; no check for a zero divisor
+  c.p[1] = a.p[1] / b.p[1];
+  return c;
+}
+
 #endif
 
 #if __clang_major__ == 3
diff --git a/src/hip_ir.ll b/src/hip_ir.ll
index 202bf9f215..52460a38bb 100644
--- a/src/hip_ir.ll
+++ b/src/hip_ir.ll
@@ -61,6 +61,29 @@ define half @__hip_hc_ir_hfma_half(half %a, half %b, half %c) #1 {
   ret half %9
 }
 
+define i32 @__hip_hc_ir_hadd2_int(i32 %a, i32 %b) #1 {  ; packed half add: low word via v_add_f16, high word via the SDWA form
+  %1 = tail call i32 asm sideeffect "v_add_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
+  %2 = tail call i32 asm sideeffect "v_add_f16_sdwa $0, $2, $3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","=v,0,v,v"(i32 %1, i32 %a, i32 %b)  ; "0" ties %1 to the output register so the write to $0 is visible to the compiler and the low word is preserved
+  ret i32 %2
+}
 
+define i32 @__hip_hc_ir_hfma2_int(i32 %a, i32 %b, i32 %c) #1 {  ; packed a*b+c: low word via v_mad_f16, high word via SDWA mul then SDWA add
+  %1 = tail call i32 asm sideeffect "v_mad_f16 $0, $1, $2, $3","=v,v,v,v"(i32 %a, i32 %b, i32 %c)
+  %2 = tail call i32 asm sideeffect "v_mul_f16_sdwa $0, $2, $3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","=v,0,v,v"(i32 %1, i32 %a, i32 %b)  ; "0" ties %1 to the output so the register write is modelled as a result
+  %3 = tail call i32 asm sideeffect "v_add_f16_sdwa $0, $1, $2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","=v,0,v"(i32 %2, i32 %c)  ; $1 is the tied input, i.e. the same register as $0, holding the high-word product
+  ret i32 %3
+}
+
+define i32 @__hip_hc_ir_hmul2_int(i32 %a, i32 %b) #1 {  ; packed half multiply: low word via v_mul_f16, high word via the SDWA form
+  %1 = tail call i32 asm sideeffect "v_mul_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
+  %2 = tail call i32 asm sideeffect "v_mul_f16_sdwa $0, $2, $3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","=v,0,v,v"(i32 %1, i32 %a, i32 %b)  ; "0" ties %1 to the output register so the write to $0 is visible to the compiler and the low word is preserved
+  ret i32 %2
+}
+
+define i32 @__hip_hc_ir_hsub2_int(i32 %a, i32 %b) #1 {  ; packed half subtract: low word via v_sub_f16, high word via the SDWA form
+  %1 = tail call i32 asm sideeffect "v_sub_f16 $0, $1, $2","=v,v,v"(i32 %a, i32 %b)
+  %2 = tail call i32 asm sideeffect "v_sub_f16_sdwa $0, $2, $3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_1 src1_sel:WORD_1","=v,0,v,v"(i32 %1, i32 %a, i32 %b)  ; "0" ties %1 to the output register so the write to $0 is visible to the compiler and the low word is preserved
+  ret i32 %2
+}
 
 attributes #1 = { alwaysinline nounwind }
diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
index 9533bf34ca..2c01c5cb72 100644
--- a/tests/src/deviceLib/hipTestHalf.cpp
+++ b/tests/src/deviceLib/hipTestHalf.cpp
@@ -69,6 +69,18 @@ __global__ void CheckHalf(hipLaunchParm lp, __half* In1, __half* In2, __half* In
   Out[9] = hdiv(In1[9], In2[9]);
 }
 
+__global__ void CheckHalf2(hipLaunchParm lp, __half2* In1, __half2* In2, __half2* In3, __half2* Out){  // Out[i] receives one packed-half intrinsic applied to element i of the inputs
+  Out[0] = __hadd2(In1[0], In2[0]);
+  Out[1] = __hadd2_sat(In1[1], In2[1]);
+  Out[2] = __hfma2(In1[2], In2[2],In3[2]);  // only the fma variants read In3
+  Out[3] = __hfma2_sat(In1[3], In2[3], In3[3]);
+  Out[4] = __hmul2(In1[4], In2[4]);
+  Out[5] = __hmul2_sat(In1[5], In2[5]);
+  Out[6] = __hneg2(In1[6]);
+  Out[7] = __hsub2(In1[7], In2[7]);
+  Out[8] = __hsub2_sat(In1[8], In2[8]);
+  Out[9] = h2div(In1[9], In2[9]);
+}
 
 int main(){
 

From 2dcd7600dc6c413e47d3be5ac109f9a875e16c69 Mon Sep 17 00:00:00 2001
From: Aditya Atluri <Aditya.Atluri@amd.com>
Date: Thu, 12 Jan 2017 14:52:14 -0600
Subject: [PATCH 18/18] added comparison device functions for fp16 1. Added
 comparison device functions 2. Added test to check correct isa getting
 generated

Change-Id: I16732f5a1438bdce145f7bfcecd28198e3cc4b79
---
 include/hip/hcc_detail/hip_fp16.h   | 75 +++++++++++++++++++++++++++++
 tests/src/deviceLib/hipTestHalf.cpp | 18 ++++---
 2 files changed, 86 insertions(+), 7 deletions(-)

diff --git a/include/hip/hcc_detail/hip_fp16.h b/include/hip/hcc_detail/hip_fp16.h
index 2ef8d330e7..e4408556f1 100644
--- a/include/hip/hcc_detail/hip_fp16.h
+++ b/include/hip/hcc_detail/hip_fp16.h
@@ -36,6 +36,17 @@ typedef struct __attribute__((aligned(4))){
   };
 } __half2;
 
+struct holder{  // overlays a __half and a 16-bit unsigned integer on the same storage
+  union {
+    __half h;
+    unsigned short s;
+  };
+};
+
+#define HINF 65504  // NOTE(review): 65504 is the largest finite binary16 value; infinity is the bit pattern 0x7C00 — confirm intent
+
+static struct holder hInf = {HINF};  // NOTE(review): brace-init sets the first union member `h`, not `s`; also a per-TU host static referenced from __device__ code — confirm
+
 extern "C" __half __hip_hc_ir_hadd_half(__half, __half);
 extern "C" __half __hip_hc_ir_hfma_half(__half, __half, __half);
 extern "C" __half __hip_hc_ir_hmul_half(__half, __half);
@@ -152,6 +163,70 @@ __device__ static inline __half2 h2div(__half2 a, __half2 b) {
   return c;
 }
 
+/*
+Half Comparison Functions
+*/
+
+__device__ static inline bool __heq(__half a, __half b) {
+  return a == b;  // the comparison already yields a bool
+}
+
+__device__ static inline bool __hge(__half a, __half b) {
+  return a >= b;  // the comparison already yields a bool
+}
+
+__device__ static inline bool __hgt(__half a, __half b) {
+  return a > b;  // the comparison already yields a bool
+}
+
+__device__ static inline bool __hisinf(__half a) {  // true for +inf and -inf; tests the bit pattern instead of comparing against the host-side hInf
+  struct holder bits; bits.h = a; return (bits.s & 0x7FFF) == 0x7C00;  // sign masked off; exponent all ones, mantissa zero
+}
+
+__device__ static inline bool __hisnan(__half a) {  // true only when a is NaN
+  return a != a;  // NaN is the only value unequal to itself; an ordered compare such as `a > x` is always false for NaN
+}
+
+__device__ static inline bool __hle(__half a, __half b) {
+  return a <= b;  // the comparison already yields a bool
+}
+
+__device__ static inline bool __hlt(__half a, __half b) {
+  return a < b;  // the comparison already yields a bool
+}
+
+__device__ static inline bool __hne(__half a, __half b) {
+  return a != b;  // the comparison already yields a bool
+}
+
+/*
+Half2 Comparison Functions
+*/
+
+__device__ static inline bool __hbeq2(__half2 a, __half2 b) {
+  return (a.p[0] == b.p[0]) && (a.p[1] == b.p[1]);  // true only when both elements compare equal
+}
+
+__device__ static inline bool __hbge2(__half2 a, __half2 b) {
+  return (a.p[0] >= b.p[0]) && (a.p[1] >= b.p[1]);  // true only when both elements satisfy >=
+}
+
+__device__ static inline bool __hbgt2(__half2 a, __half2 b) {
+  return (a.p[0] > b.p[0]) && (a.p[1] > b.p[1]);  // true only when both elements satisfy >
+}
+
+__device__ static inline bool __hble2(__half2 a, __half2 b) {
+  return (a.p[0] <= b.p[0]) && (a.p[1] <= b.p[1]);  // true only when both elements satisfy <=
+}
+
+__device__ static inline bool __hblt2(__half2 a, __half2 b) {
+  return (a.p[0] < b.p[0]) && (a.p[1] < b.p[1]);  // true only when both elements satisfy <
+}
+
+__device__ static inline bool __hbne2(__half2 a, __half2 b) {
+  return (a.p[0] != b.p[0]) && (a.p[1] != b.p[1]);  // true only when both elements differ
+}
+
 #endif
 
 #if __clang_major__ == 3
diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp
index 2c01c5cb72..46927c3902 100644
--- a/tests/src/deviceLib/hipTestHalf.cpp
+++ b/tests/src/deviceLib/hipTestHalf.cpp
@@ -49,13 +49,6 @@ THE SOFTWARE.
 #define hNeg1    0xBC00
 #define hNeg1PK 0xBC00BC00
 
-struct holder{
-union{
-  __half a;
-  unsigned short b;
-};
-};
-
 __global__ void CheckHalf(hipLaunchParm lp, __half* In1, __half* In2, __half* In3, __half* Out){
   Out[0] = __hadd(In1[0], In2[0]);
   Out[1] = __hadd_sat(In1[1], In2[1]);
@@ -82,6 +75,17 @@ __global__ void CheckHalf2(hipLaunchParm lp, __half2* In1, __half2* In2, __half2
   Out[9] = h2div(In1[9], In2[9]);
 }
 
+__global__ void CheckCmpHalf(hipLaunchParm lp, __half* In1, __half* In2, bool* Out) {  // Out[i] receives one half comparison applied to element i of the inputs
+  Out[0] = __heq(In1[0], In2[0]);
+  Out[1] = __hge(In1[1], In2[1]);
+  Out[2] = __hgt(In1[2], In2[2]);
+  Out[3] = __hisinf(In1[3]);  // the two classification checks read In1 only
+  Out[4] = __hisnan(In1[4]);
+  Out[5] = __hle(In1[5], In2[5]);
+  Out[6] = __hlt(In1[6], In2[6]);
+  Out[7] = __hne(In1[7], In2[7]);
+}
+
 int main(){
 
 }