Merge pull request #1406 from emankov/master

[HIPIFY][#1400] Fix Template Instantiation kernel launch (clang & perl)
2019-09-11 15:49:46 +03:00
@@ -1662,18 +1662,37 @@ while (@ARGV) {
            # match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
            no warnings qw/uninitialized/;
            my $k = 0;
-            # Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6)/g;
-            # Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6, /g;
-            # Handle the <<numBlocks, blockDim, sharedSize>>> syntax with empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0)/g;
-            # Handle the <<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0, /g;
-            # Handle the <<numBlocks, blockDim>>> syntax with empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0)/g;
-            # Handle the <<numBlocks, blockDim>>> syntax with non-empty args:
-            $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0, /g;
+
+            # Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6)/g;
+            # Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5)/g;
+
+            # Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6, /g;
+            # Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5, /g;
+
+            # Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0)/g;
+            # Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0)/g;
+
+            # Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0, /g;
+            # Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0, /g;
+
+            # Handle the kern<...><<<numBlocks, blockDim>>> syntax with empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0)/g;
+            # Handle the kern<<<numBlocks, blockDim>>> syntax with empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0)/g;
+
+            # Handle the kern<...><<<numBlocks, blockDim>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0, /g;
+            # Handle the kern<<<numBlocks, blockDim>>> syntax with non-empty args:
+            $k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0, /g;
+
            if ($k) {
                $ft{'kern_launch'} += $k;
                $Tkernels{$1} ++;
@@ -289,24 +289,35 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
  if (!launchKernel) {
    return false;
  }
+  const clang::Expr* calleeExpr = launchKernel->getCallee();
+  if (!calleeExpr) {
+    return false;
+  }
+  const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
+  if (!caleeDecl) {
+    return false;
+  }
+  const clang::CallExpr* config = launchKernel->getConfig();
+  if (!config) {
+    return false;
+  }
  clang::SmallString<40> XStr;
  llvm::raw_svector_ostream OS(XStr);
  clang::LangOptions DefaultLangOptions;
  clang::SourceManager* SM = Result.SourceManager;
-
-  const clang::Expr& calleeExpr = *(launchKernel->getCallee());
-  OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
+  OS << "hipLaunchKernelGGL(";
+  if (caleeDecl->isTemplateInstantiation()) OS << "(";
+  OS << readSourceText(*SM, calleeExpr->getSourceRange());
+  if (caleeDecl->isTemplateInstantiation()) OS << ")";
+  OS << ", ";

  // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
-  const clang::CallExpr& config = *(launchKernel->getConfig());
-
  // Copy the two dimensional arguments verbatim.
-  OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
-  OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
-
+  OS << "dim3(" << readSourceText(*SM, config->getArg(0)->getSourceRange()) << "), ";
+  OS << "dim3(" << readSourceText(*SM, config->getArg(1)->getSourceRange()) << "), ";
  // The stream/memory arguments default to zero if omitted.
-  OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
-  OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
+  OS << stringifyZeroDefaultedArg(*SM, config->getArg(2)) << ", ";
+  OS << stringifyZeroDefaultedArg(*SM, config->getArg(3));

  // If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
  int numArgs = launchKernel->getNumArgs();
@@ -121,7 +121,7 @@ struct runner
        const size_t states_size = blocks * threads;
        // CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
        CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
-        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
+        // CHECK: hipLaunchKernelGGL((init_kernel), dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
        init_kernel<<<blocks, threads>>>(states, seed, offset);
        // CHECK: CUDA_CALL(hipPeekAtLastError());
        // CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -142,7 +142,7 @@ struct runner
                  const GenerateFunc& generate_func,
                  const Extra extra)
    {
-        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
        generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
    }
 };
@@ -223,7 +223,7 @@ struct runner<curandStateMtgp32_t>
                  const GenerateFunc& generate_func,
                  const Extra extra)
    {
-        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
        generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
    }
 };
@@ -304,7 +304,7 @@ struct runner<curandStateSobol32_t>
        CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));

        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        // CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
+        // CHECK: hipLaunchKernelGGL((init_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
        init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
        // CHECK: CUDA_CALL(hipPeekAtLastError());
        // CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -329,7 +329,7 @@ struct runner<curandStateSobol32_t>
                  const Extra extra)
    {
        const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
-        // CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
+        // CHECK: hipLaunchKernelGGL((generate_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
        generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
    }
 };
@@ -10,7 +10,7 @@
 #define KERNEL_CALL_AS_MACRO axpy<float><<<1, kDataLen>>>
 #define KERNEL_NAME_MACRO axpy<float>

-// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
+// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
 #define COMPLETE_LAUNCH axpy<<<1, kDataLen>>>(a, device_x, device_y)


@@ -48,22 +48,22 @@ int main(int argc, char* argv[]) {
  cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);

  // Launch the kernel in numerous different strange ways to exercise the prerocessor.
-  // CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
+  // CHECK: hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
  axpy<<<1, kDataLen>>>(a, device_x, device_y);

-  // CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
+  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
  axpy<float><<<1, kDataLen>>>(a, device_x, device_y);

-  // CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
+  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
  axpy<float><<<1, kDataLen>>>(a, TOKEN_PASTE(device, _x), device_y);

-  // CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  axpy<float><<<1, kDataLen>>>(ARG_LIST_AS_MACRO);

-  // CHECK: hipLaunchKernelGGL(KERNEL_NAME_MACRO, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL((KERNEL_NAME_MACRO), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  KERNEL_NAME_MACRO<<<1, kDataLen>>>(ARG_LIST_AS_MACRO);

-  // CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
+  // CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
  KERNEL_CALL_AS_MACRO(ARG_LIST_AS_MACRO);

  // CHECK: hipLaunchKernelGGL(empty, dim3(1), dim3(kDataLen), 0, 0);
@@ -94,7 +94,7 @@ int main(int argc, char *argv[])
    const unsigned threadsPerBlock = 256;

    printf ("info: launch 'vector_square' kernel\n");
-    // CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
+    // CHECK: hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
    vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);

    printf ("info: copy Device2Host\n");