Merge pull request #1406 from emankov/master
[HIPIFY][#1400] Fix Template Instantiation kernel launch (clang & perl)
Этот коммит содержится в:
+31
-12
@@ -1662,18 +1662,37 @@ while (@ARGV) {
|
||||
# match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
|
||||
no warnings qw/uninitialized/;
|
||||
my $k = 0;
|
||||
# Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6)/g;
|
||||
# Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6, /g;
|
||||
# Handle the <<numBlocks, blockDim, sharedSize>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0)/g;
|
||||
# Handle the <<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0, /g;
|
||||
# Handle the <<numBlocks, blockDim>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0)/g;
|
||||
# Handle the <<numBlocks, blockDim>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0, /g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6)/g;
|
||||
# Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5)/g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6, /g;
|
||||
# Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5, /g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0)/g;
|
||||
# Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0)/g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0, /g;
|
||||
# Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0, /g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0)/g;
|
||||
# Handle the kern<<<numBlocks, blockDim>>> syntax with empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0)/g;
|
||||
|
||||
# Handle the kern<...><<<numBlocks, blockDim>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0, /g;
|
||||
# Handle the kern<<<numBlocks, blockDim>>> syntax with non-empty args:
|
||||
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0, /g;
|
||||
|
||||
if ($k) {
|
||||
$ft{'kern_launch'} += $k;
|
||||
$Tkernels{$1} ++;
|
||||
|
||||
@@ -289,24 +289,35 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
|
||||
if (!launchKernel) {
|
||||
return false;
|
||||
}
|
||||
const clang::Expr* calleeExpr = launchKernel->getCallee();
|
||||
if (!calleeExpr) {
|
||||
return false;
|
||||
}
|
||||
const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
|
||||
if (!caleeDecl) {
|
||||
return false;
|
||||
}
|
||||
const clang::CallExpr* config = launchKernel->getConfig();
|
||||
if (!config) {
|
||||
return false;
|
||||
}
|
||||
clang::SmallString<40> XStr;
|
||||
llvm::raw_svector_ostream OS(XStr);
|
||||
clang::LangOptions DefaultLangOptions;
|
||||
clang::SourceManager* SM = Result.SourceManager;
|
||||
|
||||
const clang::Expr& calleeExpr = *(launchKernel->getCallee());
|
||||
OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
|
||||
OS << "hipLaunchKernelGGL(";
|
||||
if (caleeDecl->isTemplateInstantiation()) OS << "(";
|
||||
OS << readSourceText(*SM, calleeExpr->getSourceRange());
|
||||
if (caleeDecl->isTemplateInstantiation()) OS << ")";
|
||||
OS << ", ";
|
||||
|
||||
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
|
||||
const clang::CallExpr& config = *(launchKernel->getConfig());
|
||||
|
||||
// Copy the two dimensional arguments verbatim.
|
||||
OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
|
||||
OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
|
||||
|
||||
OS << "dim3(" << readSourceText(*SM, config->getArg(0)->getSourceRange()) << "), ";
|
||||
OS << "dim3(" << readSourceText(*SM, config->getArg(1)->getSourceRange()) << "), ";
|
||||
// The stream/memory arguments default to zero if omitted.
|
||||
OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
|
||||
OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
|
||||
OS << stringifyZeroDefaultedArg(*SM, config->getArg(2)) << ", ";
|
||||
OS << stringifyZeroDefaultedArg(*SM, config->getArg(3));
|
||||
|
||||
// If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
|
||||
int numArgs = launchKernel->getNumArgs();
|
||||
|
||||
@@ -121,7 +121,7 @@ struct runner
|
||||
const size_t states_size = blocks * threads;
|
||||
// CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
|
||||
CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
|
||||
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
|
||||
// CHECK: hipLaunchKernelGGL((init_kernel), dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
|
||||
init_kernel<<<blocks, threads>>>(states, seed, offset);
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
@@ -142,7 +142,7 @@ struct runner
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
|
||||
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
|
||||
generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
|
||||
}
|
||||
};
|
||||
@@ -223,7 +223,7 @@ struct runner<curandStateMtgp32_t>
|
||||
const GenerateFunc& generate_func,
|
||||
const Extra extra)
|
||||
{
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
|
||||
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
|
||||
generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
|
||||
}
|
||||
};
|
||||
@@ -304,7 +304,7 @@ struct runner<curandStateSobol32_t>
|
||||
CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
|
||||
|
||||
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
|
||||
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
|
||||
// CHECK: hipLaunchKernelGGL((init_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
|
||||
init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
|
||||
// CHECK: CUDA_CALL(hipPeekAtLastError());
|
||||
// CHECK: CUDA_CALL(hipDeviceSynchronize());
|
||||
@@ -329,7 +329,7 @@ struct runner<curandStateSobol32_t>
|
||||
const Extra extra)
|
||||
{
|
||||
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
|
||||
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
|
||||
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
|
||||
generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
|
||||
}
|
||||
};
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
#define KERNEL_CALL_AS_MACRO axpy<float><<<1, kDataLen>>>
|
||||
#define KERNEL_NAME_MACRO axpy<float>
|
||||
|
||||
// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
|
||||
// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
|
||||
#define COMPLETE_LAUNCH axpy<<<1, kDataLen>>>(a, device_x, device_y)
|
||||
|
||||
|
||||
@@ -48,22 +48,22 @@ int main(int argc, char* argv[]) {
|
||||
cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
|
||||
|
||||
// Launch the kernel in numerous different strange ways to exercise the prerocessor.
|
||||
// CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
|
||||
// CHECK: hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
|
||||
axpy<<<1, kDataLen>>>(a, device_x, device_y);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
|
||||
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
|
||||
axpy<float><<<1, kDataLen>>>(a, device_x, device_y);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
|
||||
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
|
||||
axpy<float><<<1, kDataLen>>>(a, TOKEN_PASTE(device, _x), device_y);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
axpy<float><<<1, kDataLen>>>(ARG_LIST_AS_MACRO);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(KERNEL_NAME_MACRO, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
// CHECK: hipLaunchKernelGGL((KERNEL_NAME_MACRO), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
KERNEL_NAME_MACRO<<<1, kDataLen>>>(ARG_LIST_AS_MACRO);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
|
||||
KERNEL_CALL_AS_MACRO(ARG_LIST_AS_MACRO);
|
||||
|
||||
// CHECK: hipLaunchKernelGGL(empty, dim3(1), dim3(kDataLen), 0, 0);
|
||||
|
||||
@@ -94,7 +94,7 @@ int main(int argc, char *argv[])
|
||||
const unsigned threadsPerBlock = 256;
|
||||
|
||||
printf ("info: launch 'vector_square' kernel\n");
|
||||
// CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
|
||||
// CHECK: hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
|
||||
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
|
||||
|
||||
printf ("info: copy Device2Host\n");
|
||||
|
||||
Ссылка в новой задаче
Block a user