From 96a7f1853e31b60f809e1d1ff034a2aadc8351d0 Mon Sep 17 00:00:00 2001
From: Sandeep Kumar <Sandeep.Kumar@amd.com>
Date: Thu, 11 May 2017 16:33:31 +0530
Subject: [PATCH] Add inline asm hip directed tests for v_add and v_mac

Change-Id: Ie5ace2e42d5da89b16e040537df2bb13d3883c6d


[ROCm/hip commit: 830f2b100d59d6cc58301db68d55f01981f8fbde]
---
 .../hip/tests/src/kernel/inline_asm_vadd.cpp  | 126 ++++++++++++++++++
 .../hip/tests/src/kernel/inline_asm_vmac.cpp  | 125 +++++++++++++++++
 2 files changed, 251 insertions(+)
 create mode 100644 projects/hip/tests/src/kernel/inline_asm_vadd.cpp
 create mode 100644 projects/hip/tests/src/kernel/inline_asm_vmac.cpp
diff --git a/projects/hip/tests/src/kernel/inline_asm_vadd.cpp b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp
new file mode 100644
index 0000000000..481b606e89
--- /dev/null
+++ b/projects/hip/tests/src/kernel/inline_asm_vadd.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
+associated documentation files (the "Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the
+following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT
+LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO
+EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
+
+/* HIT_START
+ * BUILD: %t %s
+ * RUN: %t
+ * HIT_END
+ */
+
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vadd_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
+}
+
+// CPU implementation of Vector Result
+void addCPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vadd_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}
diff --git a/projects/hip/tests/src/kernel/inline_asm_vmac.cpp b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp
new file mode 100644
index 0000000000..1b6941c249
--- /dev/null
+++ b/projects/hip/tests/src/kernel/inline_asm_vmac.cpp
@@ -0,0 +1,125 @@
+/*
+Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+#include<iostream>
+
+// hip header file
+#include "hip/hip_runtime.h"
+
+#define NUM       1024
+
+#define THREADS_PER_BLOCK_X  4
+
+// Device (Kernel) function, it must be void
+// hipLaunchParm provides the execution configuration
+__global__ void vmac_asm(hipLaunchParm lp,
+                                float *out,
+                                float *in)
+{
+    int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+
+    asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
+}
+
+// CPU implementation of saxpy
+void CPUReference(
+    float * output,
+    float * input)
+{
+    for(unsigned int j=0; j < NUM; j++)
+    {
+        
+        output[j]= a*input[j] + output[j];
+    }
+}
+
+int main(){
+
+    float* VectorA;
+    float* ResultVector;
+    float* VectorB;
+
+    float* gpuVector;
+    float* gpuResultVector;
+
+    const float a = 10.0f
+    int i;
+    int errors;
+
+    VectorA = (float*)malloc(NUM * sizeof(float));
+    ResultVector = (float*)malloc(NUM * sizeof(float));
+    VectorB = (float*)malloc(NUM * sizeof(float));
+
+    // initialize the input data
+    for (i = 0; i < NUM; i++) {
+    VectorA[i] = (float)i*10.0f;
+    VectorB[i] = (float)i*30.0f;
+    }
+
+    // allocate the memory on the device side
+    hipMalloc((void**)&gpuVector, NUM * sizeof(float));
+    hipMalloc((void**)&gpuResultVector, NUM * sizeof(float));
+
+    // Memory transfer from host to device
+    hipMemcpy(gpuVector, VectorA, NUM*sizeof(float), hipMemcpyHostToDevice);
+    hipMemcpy(gpuResultVector, VectorB, NUM*sizeof(float), hipMemcpyHostToDevice);
+
+    // Lauching kernel from host
+    hipLaunchKernel(vmac_asm,
+                    dim3(NUM/THREADS_PER_BLOCK_X),
+                    dim3(THREADS_PER_BLOCK_X),
+                    0, 0,
+                    gpuResultVector , gpuVector);
+
+    // Memory transfer from device to host
+    hipMemcpy(ResultVector, gpuResultVector, NUM*sizeof(float), hipMemcpyDeviceToHost);
+
+    // CPU Result computation
+    addCPUReference(VectorB, VectorA);
+
+    // verify the results
+    errors = 0;
+    double eps = 1.0E-3;
+    for (i = 0; i < NUM; i++) {
+    if (std::abs(ResultVector[i] - VectorB[i]) > eps ) {
+        errors++;
+        }
+    }
+    if (errors!=0) {
+    printf("FAILED: %d errors\n",errors);
+    } else {
+        printf ("PASSED!\n");
+    }
+
+    //free the resources on device side
+    hipFree(gpuVector);
+    hipFree(gpuResultVector);
+
+    hipDeviceReset();
+
+    //free the resources on host side
+    free(VectorA);
+    free(ResultVector);
+    free(VectorB);
+
+    return errors;
+}