SWDEV-286322 - clean up trailing space (#2361)

Change-Id: I03c07e67a8d1fa1a874718ffba43eb396c2aa05c
2021-09-24 06:57:51 -04:00
commit 44581b4d3c
@@ -38,9 +38,9 @@ THE SOFTWARE.
 __global__ void EmptyKernel() { }

 void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &results, int batch = 1) {
-    
+
    float total_us = 0.0f, mean_us = 0.0f, stddev_us = 0.0f;
-    
+
    // skip warm-up runs
    auto start_iter = std::next(results.begin(), WARMUP_RUN_COUNT);
    auto end_iter = results.end();
@@ -48,7 +48,7 @@ void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &re
    // mean
    std::for_each(start_iter, end_iter, [&](const float &run_ms) {
        total_us += (run_ms * 1000) / batch;
-    });   
+    });
    mean_us = total_us  / TIMING_RUN_COUNT;

   // stddev
@@ -63,18 +63,18 @@ void print_timing(std::string test, const std::array<float, TOTAL_RUN_COUNT> &re
    printf("\n %s: %.1f us, std: %.1f us\n", test.c_str(), mean_us, stddev_us);
 }

-int main() {   
+int main() {
    hipStream_t stream0 = 0;
    hipDevice_t device;
    hipDeviceGet(&device, 0);
-    hipCtx_t context;     
-    hipCtxCreate(&context, 0, device); 
+    hipCtx_t context;
+    hipCtxCreate(&context, 0, device);
    hipModule_t module;
    hipFunction_t function;
    hipModuleLoad(&module, FILE_NAME);
    hipModuleGetFunction(&function, module, KERNEL_NAME);
    void* params = nullptr;
-    
+
    std::array<float, TOTAL_RUN_COUNT> results;
    hipEvent_t start, stop;
    hipEventCreate(&start);
@@ -83,7 +83,7 @@ int main() {
    /************************************************************************************/
    /* HIP kernel launch enqueue rate:                                                  */
    /* Measure time taken to enqueue a kernel on the GPU                                */
-    /************************************************************************************/ 
+    /************************************************************************************/

    // Timing hipModuleLaunchKernel
    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
@@ -104,8 +104,8 @@ int main() {
    print_timing("hipLaunchKernelGGL enqueue rate", results);

    /***********************************************************************************/
-    /* Single dispatch execution latency using HIP events:                             */   
-    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */ 
+    /* Single dispatch execution latency using HIP events:                             */
+    /* Measures latency to start & finish executing a kernel with GPU-scope visibility    */
    /***********************************************************************************/

    //Timing around the dispatch
@@ -120,7 +120,7 @@ int main() {

    /*********************************************************************************/
    /* Batch dispatch execution latency using HIP events:                            */
-    /* Measures latency to start & finish executing each dispatch in a batch    */ 
+    /* Measures latency to start & finish executing each dispatch in a batch    */
    /*********************************************************************************/

    for (auto i = 0; i < TOTAL_RUN_COUNT; ++i) {
@@ -29,23 +29,23 @@ THE SOFTWARE.

 // Device (Kernel) function
 __global__ void multiply(float* C, float* A, float* B, int N){
-      
+
    int tx = blockDim.x*blockIdx.x+threadIdx.x;
-    
+
    if (tx < N){
 	C[tx] = A[tx] * B[tx];
    }
 }
 // CPU implementation
 void multiplyCPU(float* C, float* A, float* B, int N){
-    
-    for(unsigned int i=0; i<N; i++){     
-        C[i] = A[i] * B[i];      
+
+    for(unsigned int i=0; i<N; i++){
+        C[i] = A[i] * B[i];
    }
 }

 void launchKernel(float* C, float* A, float* B, bool manual){
-     
+
     hipDeviceProp_t devProp;
     HIP_CHECK(hipGetDeviceProperties(&devProp, 0));

@@ -59,9 +59,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){
     int mingridSize = 0;
     int gridSize = 0;
     int blockSize = 0;
-     
+
     if (manual){
-	blockSize = threadsperblock; 
+	blockSize = threadsperblock;
 	gridSize  = blocks;
 	std::cout << std::endl << "Manual Configuration with block size " << blockSize << std::endl;
     }
@@ -69,15 +69,15 @@ void launchKernel(float* C, float* A, float* B, bool manual){
 	HIP_CHECK(hipOccupancyMaxPotentialBlockSize(&mingridSize, &blockSize, multiply, 0, 0));
 	std::cout << std::endl << "Automatic Configuation based on hipOccupancyMaxPotentialBlockSize " << std::endl;
 	std::cout << "Suggested blocksize is " << blockSize << ", Minimum gridsize is " << mingridSize << std::endl;
-	gridSize = (NUM/blockSize)+1; 
+	gridSize = (NUM/blockSize)+1;
     }

     // Record the start event
-     HIP_CHECK(hipEventRecord(start, NULL));  
+     HIP_CHECK(hipEventRecord(start, NULL));

     // Launching the Kernel from Host
     hipLaunchKernelGGL(multiply, dim3(gridSize), dim3(blockSize), 0, 0, C, A, B, NUM);
-     
+
     // Record the stop event
     HIP_CHECK(hipEventRecord(stop, NULL));
     HIP_CHECK(hipEventSynchronize(stop));
@@ -88,7 +88,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){
     //Calculate Occupancy
     int numBlock = 0;
     HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0));
-     
+
     if(devProp.maxThreadsPerMultiProcessor){
 	std::cout << "Theoretical Occupancy is " << (double)numBlock* blockSize/devProp.maxThreadsPerMultiProcessor * 100 << "%" << std::endl;
     }
@@ -106,26 +106,26 @@ int main() {
     C0 = (float *)malloc(NUM * sizeof(float));
     C1 = (float *)malloc(NUM * sizeof(float));
     cpuC = (float *)malloc(NUM * sizeof(float));
-     
+
     for(i=0; i< NUM; i++){
 	A[i] = i;
 	B[i] = i;
     }
-    
-     // allocate the memory on the device side   
+
+     // allocate the memory on the device side
     HIP_CHECK(hipMalloc((void**)&Ad, NUM * sizeof(float)));
     HIP_CHECK(hipMalloc((void**)&Bd, NUM * sizeof(float)));
     HIP_CHECK(hipMalloc((void**)&C0d, NUM * sizeof(float)));
     HIP_CHECK(hipMalloc((void**)&C1d, NUM * sizeof(float)));
- 
+
     // Memory transfer from host to device
     HIP_CHECK(hipMemcpy(Ad,A,NUM * sizeof(float), hipMemcpyHostToDevice));
     HIP_CHECK(hipMemcpy(Bd,B,NUM * sizeof(float), hipMemcpyHostToDevice));

     //Kernel launch with manual/default block size
     launchKernel(C0d, Ad, Bd, 1);
-     
-     //Kernel launch with the block size suggested by hipOccupancyMaxPotentialBlockSize 
+
+     //Kernel launch with the block size suggested by hipOccupancyMaxPotentialBlockSize
     launchKernel(C1d, Ad, Bd, 0);

     // Memory transfer from device to host
@@ -137,26 +137,26 @@ int main() {

     //verify the results
     double eps = 1.0E-6;
-     
+
       for (i = 0; i < NUM; i++) {
 	  if (std::abs(C0[i] - cpuC[i]) > eps) {
 		  errors++;
 	}
     }
-          
+
     if (errors != 0){
 	printf("\nManual Test FAILED: %d errors\n", errors);
 	errors=0;
     } else {
 	printf("\nManual Test PASSED!\n");
     }
-     
+
     for (i = 0; i < NUM; i++) {
 	  if (std::abs(C1[i] - cpuC[i]) > eps) {
 		  errors++;
 	}
     }
-          
+
     if (errors != 0){
 	printf("\n Automatic Test FAILED: %d errors\n", errors);
     } else {
@@ -36,11 +36,11 @@ int main(){

    e = hipMemcpyFromSymbol(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost);
    HIPASSERT(e==hipErrorInvalidSymbol);
-    
+
    e = hipMemcpyFromSymbol(S, NULL, SIZE, 0, hipMemcpyDeviceToHost);
    HIPASSERT(e==hipErrorInvalidSymbol);

-    HIPCHECK(hipFree(Sd)); 
-    
+    HIPCHECK(hipFree(Sd));
+
    passed();
 }
@@ -39,11 +39,11 @@ int main(){

    e = hipMemcpyFromSymbolAsync(S, HIP_SYMBOL(Sd), SIZE, 0, hipMemcpyDeviceToHost, stream);
    HIPASSERT(e==hipErrorInvalidSymbol);
-    
+
    e = hipMemcpyFromSymbolAsync(S, NULL, SIZE, 0, hipMemcpyDeviceToHost, stream);
    HIPASSERT(e==hipErrorInvalidSymbol);

-    HIPCHECK(hipFree(Sd)); 
-    
+    HIPCHECK(hipFree(Sd));
+
    passed();
 }
@@ -36,11 +36,11 @@ int main(){

    e = hipMemcpyToSymbol(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice);
    HIPASSERT(e==hipErrorInvalidSymbol);
-    
+
    e = hipMemcpyToSymbol(NULL, S, SIZE, 0, hipMemcpyHostToDevice);
    HIPASSERT(e==hipErrorInvalidSymbol);

-    HIPCHECK(hipFree(Sd)); 
-    
+    HIPCHECK(hipFree(Sd));
+
    passed();
 }
@@ -31,7 +31,7 @@ int main(){
    void *Sd;
    hipError_t e;
    char S[SIZE]="This is not a device symbol";
-    
+
    HIPCHECK(hipMalloc(&Sd,SIZE));

    hipStream_t stream;
@@ -39,11 +39,11 @@ int main(){

    e = hipMemcpyToSymbolAsync(HIP_SYMBOL(Sd), S, SIZE, 0, hipMemcpyHostToDevice, stream);
    HIPASSERT(e==hipErrorInvalidSymbol);
-    
+
    e = hipMemcpyToSymbolAsync(NULL, S, SIZE, 0, hipMemcpyHostToDevice, stream);
    HIPASSERT(e==hipErrorInvalidSymbol);
-   
-    HIPCHECK(hipFree(Sd)); 
-    
+
+    HIPCHECK(hipFree(Sd));
+
    passed();
 }
@@ -1,4 +1,4 @@
-/* 
+/*
 Copyright (c) 2015 - 2021 Advanced Micro Devices, Inc. All rights reserved.
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
@@ -32,7 +32,7 @@ int main(){

   e = hipMemcpy(0, str, SIZE, hipMemcpyHostToDevice);
   HIPASSERT(e==hipErrorInvalidValue);
-   
+
   e = hipMemcpy(NULL, str, SIZE, hipMemcpyHostToDevice);
   HIPASSERT(e==hipErrorInvalidValue);

@@ -45,7 +45,7 @@ void HipClassTests::TestForOverride(void){
                     0,
                     0,
                     result_ecd);
-  
+
  HipClassTests::VerifyResult(result_ech,result_ecd);
  HipClassTests::FreeMem(result_ech,result_ecd);
 }
@@ -70,13 +70,13 @@ void HipClassTests::TestForOverload(void){
                     0,
                     0,
                     result_ecd);
-  
+
  HipClassTests::VerifyResult(result_ech,result_ecd);
  HipClassTests::FreeMem(result_ech,result_ecd);
 }
 #endif

-#ifdef ENABLE_FRIEND_TEST 
+#ifdef ENABLE_FRIEND_TEST
 // check for friend
 __global__ void
 friendClassKernel(bool* result_ecd){
@@ -106,7 +106,7 @@ void HipClassTests::TestForEmptyClass(void){
                     0,
                     0,
                     result_ecd);
-  
+
  HipClassTests::VerifyResult(result_ech,result_ecd);
  HipClassTests::FreeMem(result_ech,result_ecd);
 }
@@ -157,7 +157,7 @@ void HipClassTests::TestForClassSize(void){
                      0,
                      0,
                      result_ecd);
-  
+
  HipClassTests::VerifyResult(result_ech,result_ecd);
  HipClassTests::FreeMem(result_ech,result_ecd);
 }
@@ -217,7 +217,7 @@ void HipClassTests::TestForPassByValue(void){
  HipClassTests::VerifyResult(result_ech,result_ecd);
  HipClassTests::FreeMem(result_ech,result_ecd);
 }
- 
+
 // check obj created with hipMalloc
 __global__ void
 mallocObjKernel(testPassByValue *obj, bool* result_ecd) {
@@ -292,7 +292,7 @@ bool* HipClassTests::AllocateHostMemory(void){
 }

 bool* HipClassTests::AllocateDeviceMemory(void){
-  bool* result_ecd; 
+  bool* result_ecd;
  HIPCHECK(hipMalloc(&result_ecd,
                     NBOOL));
  HIPCHECK(hipMemset(result_ecd,
@@ -351,5 +351,5 @@ int main(){
 #ifdef ENABLE_DESTRUCTOR_TEST
  classTests.TestForConsrtDesrt();
  test_passed(TestForConsrtDesrt);
-#endif 
+#endif
 }
@@ -55,17 +55,17 @@ __host__ __device__ void testOperations(float &fa, float &fb) {
  hip_bfloat16 bf_a(fa);
  hip_bfloat16 bf_b(fb);
  float fc = float(bf_a);
-  float fd = float(bf_b); 
+  float fd = float(bf_b);

  assert(testRelativeAccuracy(fa, bf_a));
  assert(testRelativeAccuracy(fb, bf_b));

  assert(testRelativeAccuracy(fc + fd, bf_a + bf_b));
-  //when checked as above for add, operation sub fails on GPU 
+  //when checked as above for add, operation sub fails on GPU
  assert(hip_bfloat16(fc - fd) == (bf_a - bf_b));
  assert(testRelativeAccuracy(fc * fd, bf_a * bf_b));
  assert(testRelativeAccuracy(fc / fd, bf_a / bf_b));
- 
+
  hip_bfloat16 bf_opNegate = -bf_a;
  assert(bf_opNegate == -bf_a);

@@ -75,7 +75,7 @@ __host__ __device__ void testOperations(float &fa, float &fb) {
  bf_x--;
  ++bf_x;
  --bf_x;
-  //hip_bfloat16 is converted to float and then inc/decremented, hence check with reduced precision 
+  //hip_bfloat16 is converted to float and then inc/decremented, hence check with reduced precision
  assert(testRelativeAccuracy(bf_x,bf_a));

  bf_x = bf_a;
@@ -95,7 +95,7 @@ __host__ __device__ void testOperations(float &fa, float &fb) {
  if (isnan(bf_rounded)) {
    assert(isnan(bf_rounded) || isinf(bf_rounded));
  }
-}  
+}

 __global__ void testOperationsGPU(float* d_a, float* d_b)
 {
@@ -126,7 +126,7 @@ int main(){

  hipLaunchKernelGGL(testOperationsGPU, 1, SIZE, 0, 0, d_fa, d_fb);
  hipDeviceSynchronize();
-  cout<<"Device bfloat16 Operations Successful!!"<<endl; 
+  cout<<"Device bfloat16 Operations Successful!!"<<endl;

  delete[] h_fa;
  delete[] h_fb;
@@ -56,21 +56,21 @@ __global__ void kernel_lgamma_double(double *input, double *output) {
 void check_lgamma_double() {

  using datatype_t = double;
-  
+
  const int NUM_INPUTS = 8;
  auto memsize = NUM_INPUTS * sizeof(datatype_t);
-  
+
  // allocate memories
  datatype_t *inputCPU = (datatype_t *) malloc(memsize);
  datatype_t *outputCPU = (datatype_t *) malloc(memsize);
  datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
  datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
-  
+
  // populate input
  for (int i=0; i<NUM_INPUTS; i++) {
    inputCPU[i] = -3.5 + i;
  }
-  
+
  // copy inputs to device
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

@@ -84,13 +84,13 @@ void check_lgamma_double() {
  for (int i=0; i<NUM_INPUTS; i++) {
    CHECK_LGAMMA_DOUBLE(inputCPU[i], outputCPU[i], lgamma(inputCPU[i]));
  }
-  
+
  // free memories
  hipFree(inputGPU);
  hipFree(outputGPU);
  free(inputCPU);
  free(outputCPU);
-  
+
  // done
  return;
 }
@@ -102,15 +102,15 @@ void check_abs_int64() {

  const int NUM_INPUTS = 8;
  auto memsize = NUM_INPUTS * sizeof(datatype_t);
-  
+
  // allocate memories
  datatype_t *inputCPU = (datatype_t *) malloc(memsize);
  datatype_t *outputCPU = (datatype_t *) malloc(memsize);
  datatype_t *inputGPU = nullptr; hipMalloc((void**)&inputGPU, memsize);
  datatype_t *outputGPU = nullptr; hipMalloc((void**)&outputGPU, memsize);
-  
+
  // populate input
-  inputCPU[0] = -81985529216486895ll; 
+  inputCPU[0] = -81985529216486895ll;
  inputCPU[1] =  81985529216486895ll;
  inputCPU[2] = -1250999896491ll;
  inputCPU[3] =  1250999896491ll;
@@ -118,7 +118,7 @@ void check_abs_int64() {
  inputCPU[5] =  19088743ll;
  inputCPU[6] = -291ll;
  inputCPU[7] =  291ll;
-  
+
  // copy inputs to device
  hipMemcpy(inputGPU, inputCPU, memsize, hipMemcpyHostToDevice);

@@ -137,17 +137,17 @@ void check_abs_int64() {
  CHECK_ABS_INT64(inputCPU[5], outputCPU[5], outputCPU[5]);
  CHECK_ABS_INT64(inputCPU[6], outputCPU[6], outputCPU[7]);
  CHECK_ABS_INT64(inputCPU[7], outputCPU[7], outputCPU[7]);
-  
+
  // free memories
  hipFree(inputGPU);
  hipFree(outputGPU);
  free(inputCPU);
  free(outputCPU);
-  
+
  // done
  return;
 }
-  
+

 template<class T, class F>
 __global__ void kernel_simple(F f, T *out) {
@@ -191,7 +191,7 @@ int main(int argc, char* argv[]) {
    check_abs_int64();

    // check_lgamma_double();
-    
+
    test_fp16();

    test_pown();
@@ -82,7 +82,7 @@ __device__ __host__ complex<FloatT> calc(complex<FloatT> A,
    return A * B;
  case CK_div:
    return A / B;
-    
+
    ONE_ARG(abs)
    ONE_ARG(arg)
    ONE_ARG(sin)
@@ -111,7 +111,7 @@ void test() {
    hipMalloc((void**)&Ad, sizeof(ComplexT)*LEN);
    hipMalloc((void**)&Bd, sizeof(ComplexT)*LEN);
    hipMalloc((void**)&Cd, sizeof(ComplexT)*LEN);
-    
+
    for (uint32_t i = 0; i < LEN; i++) {
        A[i] = ComplexT((i + 1) * 1.0f, (i + 2) * 1.0f);
        B[i] = A[i];
@@ -119,7 +119,7 @@ void test() {
    }
    hipMemcpy(Ad, A, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
    hipMemcpy(Bd, B, sizeof(ComplexT)*LEN, hipMemcpyHostToDevice);
-    
+
    // Run kernel for a calculation kind and verify by comparing with host
    // calculation result. Returns false if fails.
    auto test_fun = [&](enum CalcKind CK) {
@@ -145,7 +145,7 @@ void test() {
      }
      return true;
    };
-    
+
 #define OP(x) assert(test_fun(CK_##x));
    ALL_FUN
 #undef OP
@@ -84,7 +84,7 @@ void kernel_hisnan(__half* input, int* output) {
 }

 __global__
-void kernel_hisinf(__half* input, int* output) { 
+void kernel_hisinf(__half* input, int* output) {
  int tx = threadIdx.x;
  output[tx] = __hisinf(input[tx]);
 }
@@ -41,7 +41,7 @@ THE SOFTWARE.
  private:
      int a;
  };
-     
+
  static __global__ void kernel(int* Ad) {
      int tid = threadIdx.x + blockIdx.x * blockDim.x;
      new(Ad+tid) A();
@@ -41,7 +41,7 @@ int readHipEnvVar(string flags, char* buff){

    std::cout << "\nFinding hipEnvVar in " << directed_dir << "...\n";
    FILE* directed_in = popen((directed_dir + flags).c_str(), "r");
-    
+
    if(fgets(buff, 512, directed_in) == NULL){
        std::cout << "Finding hipEnvVar in " << dir << "...\n";
        FILE* in = popen((dir + flags).c_str(), "r");
@@ -74,7 +74,7 @@ int getDeviceNumber(bool print_err=true) {
 }

 // Query the current device ID remotely to hipEnvVar
-void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {    
+void getDevicePCIBusNumRemote(int deviceID, char* pciBusID) {
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
    if (readHipEnvVar((" -d " + std::to_string(deviceID)), pciBusID)){
        std::cerr << "The system cannot find hipEnvVar\n";