[HIPIFY] Return to wrapping with HIP_KERNEL_NAME(...) macro of a template instantiation kernel launch

[REASON] 1. hip-clang is fine with the templated kernel launch, brackets are unneeded: HIP_KERNEL_NAME(...) __VA_ARGS__ 2. HCC is not, thus: HIP_KERNEL_NAME(...) (__VA_ARGS__) [TODO] Clean-up entirely kernel name wrapping when HCC is finally obsolete. + Update perl generation, hipify-perl, and affected tests accordingly.
2019-10-02 16:01:07 +03:00
commit 6bb9913e8a
@@ -1610,32 +1610,32 @@ sub transformKernelLaunch {
    my $k = 0;

    # Handle the kern<...><<<Dg, Db, Ns, S>>>() syntax with empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6)/g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, $6)/g;
    # Handle the kern<<<Dg, Db, Ns, S>>>() syntax with empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5)/g;

    # Handle the kern<...><<<Dg, Db, Ns, S>>>(...) syntax with non-empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6, /g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, $6, /g;
    # Handle the kern<<<Dg, Db, Ns, S>>>(...) syntax with non-empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5, /g;

    # Handle the kern<...><<<Dg, Db, Ns>>>() syntax with empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0)/g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, 0)/g;
    # Handle the kern<<<Dg, Db, Ns>>>() syntax with empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0)/g;

    # Handle the kern<...><<Dg, Db, Ns>>>(...) syntax with non-empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0, /g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, 0, /g;
    # Handle the kern<<<Dg, Db, Ns>>>(...) syntax with non-empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0, /g;

    # Handle the kern<...><<<Dg, Db>>>() syntax with empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0)/g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), 0, 0)/g;
    # Handle the kern<<<Dg, Db>>>() syntax with empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0)/g;

    # Handle the kern<...><<<Dg, Db>>>(...) syntax with non-empty args:
-    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0, /g;
+    $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), 0, 0, /g;
    # Handle the kern<<<Dg, Db>>>(...) syntax with non-empty args:
    $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0, /g;

@@ -197,32 +197,32 @@ namespace perl {
    *streamPtr.get() << tab << sMy_k << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<<Dg, Db, Ns, S>>>() syntax with empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6)/g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, $6)/g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db, Ns, S>>>() syntax with empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5)/g;" << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<<Dg, Db, Ns, S>>>(...) syntax with non-empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6, /g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, $6, /g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db, Ns, S>>>(...) syntax with non-empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5, /g;" << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<<Dg, Db, Ns>>>() syntax with empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0)/g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, 0)/g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db, Ns>>>() syntax with empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0)/g;" << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<Dg, Db, Ns>>>(...) syntax with non-empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0, /g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), $5, 0, /g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db, Ns>>>(...) syntax with non-empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0, /g;" << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<<Dg, Db>>>() syntax with empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0)/g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), 0, 0)/g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db>>>() syntax with empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\((\\s*)\\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0)/g;" << std::endl << std::endl;

    *streamPtr.get() << tab << "# Handle the kern<...><<<Dg, Db>>>(...) syntax with non-empty args:" << std::endl;
-    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0, /g;" << std::endl;
+    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<(.+)>\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL(HIP_KERNEL_NAME($1<$2>), dim3($3), dim3($4), 0, 0, /g;" << std::endl;
    *streamPtr.get() << tab << "# Handle the kern<<<Dg, Db>>>(...) syntax with non-empty args:" << std::endl;
    *streamPtr.get() << tab << "$k += s/(\\w+)\\s*<<<\\s*(.+)\\s*,\\s*(.+)\\s*>>>(\\s*)\\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0, /g;" << std::endl << std::endl;

@@ -38,6 +38,7 @@ namespace mat = clang::ast_matchers;

 const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED";
 std::string sHIP_SYMBOL = "HIP_SYMBOL";
+std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME";
 std::string s_reinterpret_cast = "reinterpret_cast<const void*>";
 const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL(";
 const std::string sDim3 = "dim3(";
@@ -348,7 +349,7 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
  clang::LangOptions DefaultLangOptions;
  clang::SourceManager* SM = Result.SourceManager;
  OS << sHipLaunchKernelGGL;
-  if (caleeDecl->isTemplateInstantiation()) OS << "(";
+  if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "(";
  OS << readSourceText(*SM, calleeExpr->getSourceRange());
  if (caleeDecl->isTemplateInstantiation()) OS << ")";
  OS << ", ";
@@ -257,7 +257,7 @@ void runTest() {
    // CHECK: hipMemcpy(dOData, hOData, memSize, hipMemcpyHostToDevice);
    cudaMemcpy(dOData, hOData, memSize, cudaMemcpyHostToDevice);
    // Execute the kernel
-    // CHECK: hipLaunchKernelGGL((testKernel), dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
+    // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(testKernel), dim3(numBlocks), dim3(numThreads), 0, 0, dOData);
    testKernel<<<numBlocks, numThreads>>>(dOData);
    // Copy result from device to host
    // CHECK: hipMemcpy(hOData, dOData, memSize, hipMemcpyDeviceToHost);
@@ -121,7 +121,7 @@ struct runner
        const size_t states_size = blocks * threads;
        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
-        // CHECK: hipLaunchKernelGGL((init_kernel), dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
+        // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
        init_kernel<<<blocks, threads>>>(states, seed, offset);
        // CHECK: CUDA_CALL(hipPeekAtLastError());
        // CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -142,7 +142,7 @@ struct runner
                  const GenerateFunc& generate_func,
                  const Extra extra)
    {
-        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
        generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
    }
 };
@@ -223,7 +223,7 @@ struct runner<curandStateMtgp32_t>
                  const GenerateFunc& generate_func,
                  const Extra extra)
    {
-        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
        generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
    }
 };
@@ -304,7 +304,7 @@ struct runner<curandStateSobol32_t>
        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));

        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        // CHECK: hipLaunchKernelGGL((init_kernel), dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, directions, offset);
+        // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(init_kernel), dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, directions, offset);
        init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
        // CHECK: CUDA_CALL(hipPeekAtLastError());
        // CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -329,7 +329,7 @@ struct runner<curandStateSobol32_t>
                  const Extra extra)
    {
        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(generate_kernel), dim3(blocks_x, dimensions), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
        generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
    }
 };
@@ -10,7 +10,7 @@
 #define KERNEL_CALL_AS_MACRO axpy<float><<<1, kDataLen>>>
 #define KERNEL_NAME_MACRO axpy<float>

-// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
+// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
 #define COMPLETE_LAUNCH axpy<<<1, kDataLen>>>(a, device_x, device_y)


@@ -48,22 +48,22 @@ int main(int argc, char* argv[]) {
  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);

  // Launch the kernel in numerous different strange ways to exercise the prerocessor.
-  // CHECK: hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
  axpy<<<1, kDataLen>>>(a, device_x, device_y);

-  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
  axpy<float><<<1, kDataLen>>>(a, device_x, device_y);

-  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
  axpy<float><<<1, kDataLen>>>(a, TOKEN_PASTE(device, _x), device_y);

-  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  axpy<float><<<1, kDataLen>>>(ARG_LIST_AS_MACRO);

-  // CHECK: hipLaunchKernelGGL((KERNEL_NAME_MACRO), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(KERNEL_NAME_MACRO), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  KERNEL_NAME_MACRO<<<1, kDataLen>>>(ARG_LIST_AS_MACRO);

-  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  KERNEL_CALL_AS_MACRO(ARG_LIST_AS_MACRO);

  // CHECK: hipLaunchKernelGGL(empty, dim3(1), dim3(kDataLen), 0, 0);
@@ -94,7 +94,7 @@ int main(int argc, char *argv[])
    const unsigned threadsPerBlock = 256;

    printf ("info: launch 'vector_square' kernel\n");
-    // CHECK: hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);

    printf ("info: copy Device2Host\n");