Merge pull request #1406 from emankov/master

[HIPIFY][#1400] Fix Template Instantiation kernel launch (clang & perl)
Этот коммит содержится в:
Evgeny Mankov
2019-09-11 15:49:46 +03:00
коммит произвёл GitHub
родитель 90acfb809c 56ab105e9d
Коммит ccf8ffc0c9
5 изменённых файлов: 65 добавлений и 35 удалений
+31 -12
Просмотреть файл
@@ -1662,18 +1662,37 @@ while (@ARGV) {
# match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
no warnings qw/uninitialized/;
my $k = 0;
# Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6)/g;
# Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, $6, /g;
# Handle the <<numBlocks, blockDim, sharedSize>>> syntax with empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0)/g;
# Handle the <<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), $5, 0, /g;
# Handle the <<numBlocks, blockDim>>> syntax with empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0)/g;
# Handle the <<numBlocks, blockDim>>> syntax with non-empty args:
$k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1$2, dim3($3), dim3($4), 0, 0, /g;
# Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6)/g;
# Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5)/g;
# Handle the kern<...><<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, $6, /g;
# Handle the kern<<<numBlocks, blockDim, sharedSize, stream>>> syntax with non-empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, $5, /g;
# Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0)/g;
# Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0)/g;
# Handle the kern<...><<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), $5, 0, /g;
# Handle the kern<<<numBlocks, blockDim, sharedSize>>> syntax with non-empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), $4, 0, /g;
# Handle the kern<...><<<numBlocks, blockDim>>> syntax with empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0)/g;
# Handle the kern<<<numBlocks, blockDim>>> syntax with empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\((\s*)\)/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0)/g;
# Handle the kern<...><<<numBlocks, blockDim>>> syntax with non-empty args:
$k += s/(\w+)\s*<(.+)>\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL(($1<$2>), dim3($3), dim3($4), 0, 0, /g;
# Handle the kern<<<numBlocks, blockDim>>> syntax with non-empty args:
$k += s/(\w+)\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>(\s*)\(/hipLaunchKernelGGL($1, dim3($2), dim3($3), 0, 0, /g;
if ($k) {
$ft{'kern_launch'} += $k;
$Tkernels{$1} ++;
+21 -10
Просмотреть файл
@@ -289,24 +289,35 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc
if (!launchKernel) {
return false;
}
const clang::Expr* calleeExpr = launchKernel->getCallee();
if (!calleeExpr) {
return false;
}
const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee();
if (!caleeDecl) {
return false;
}
const clang::CallExpr* config = launchKernel->getConfig();
if (!config) {
return false;
}
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::LangOptions DefaultLangOptions;
clang::SourceManager* SM = Result.SourceManager;
const clang::Expr& calleeExpr = *(launchKernel->getCallee());
OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
OS << "hipLaunchKernelGGL(";
if (caleeDecl->isTemplateInstantiation()) OS << "(";
OS << readSourceText(*SM, calleeExpr->getSourceRange());
if (caleeDecl->isTemplateInstantiation()) OS << ")";
OS << ", ";
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
const clang::CallExpr& config = *(launchKernel->getConfig());
// Copy the two dimensional arguments verbatim.
OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
OS << "dim3(" << readSourceText(*SM, config->getArg(0)->getSourceRange()) << "), ";
OS << "dim3(" << readSourceText(*SM, config->getArg(1)->getSourceRange()) << "), ";
// The stream/memory arguments default to zero if omitted.
OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
OS << stringifyZeroDefaultedArg(*SM, config->getArg(2)) << ", ";
OS << stringifyZeroDefaultedArg(*SM, config->getArg(3));
// If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
int numArgs = launchKernel->getNumArgs();
+5 -5
Просмотреть файл
@@ -121,7 +121,7 @@ struct runner
const size_t states_size = blocks * threads;
// CHECK: CUDA_CALL(hipMalloc((void **)&states, states_size * sizeof(GeneratorState)));
CUDA_CALL(cudaMalloc((void **)&states, states_size * sizeof(GeneratorState)));
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
// CHECK: hipLaunchKernelGGL((init_kernel), dim3(blocks), dim3(threads), 0, 0, states, seed, offset);
init_kernel<<<blocks, threads>>>(states, seed, offset);
// CHECK: CUDA_CALL(hipPeekAtLastError());
// CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -142,7 +142,7 @@ struct runner
const GenerateFunc& generate_func,
const Extra extra)
{
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(blocks), dim3(threads), 0, 0, states, data, size, generate_func, extra);
generate_kernel<<<blocks, threads>>>(states, data, size, generate_func, extra);
}
};
@@ -223,7 +223,7 @@ struct runner<curandStateMtgp32_t>
const GenerateFunc& generate_func,
const Extra extra)
{
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(std::min((size_t)200, blocks)), dim3(256), 0, 0, states, data, size, generate_func, extra);
generate_kernel<<<std::min((size_t)200, blocks), 256>>>(states, data, size, generate_func, extra);
}
};
@@ -304,7 +304,7 @@ struct runner<curandStateSobol32_t>
CUDA_CALL(cudaMemcpy(directions, h_directions, size, cudaMemcpyHostToDevice));
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
// CHECK: hipLaunchKernelGGL(init_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
// CHECK: hipLaunchKernelGGL((init_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, directions, offset);
init_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, directions, offset);
// CHECK: CUDA_CALL(hipPeekAtLastError());
// CHECK: CUDA_CALL(hipDeviceSynchronize());
@@ -329,7 +329,7 @@ struct runner<curandStateSobol32_t>
const Extra extra)
{
const size_t blocks_x = next_power2((blocks + dimensions - 1) / dimensions);
// CHECK: hipLaunchKernelGGL(generate_kernel, dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
// CHECK: hipLaunchKernelGGL((generate_kernel), dim3(dim3(blocks_x, dimensions)), dim3(threads), 0, 0, states, data, size / dimensions, generate_func, extra);
generate_kernel<<<dim3(blocks_x, dimensions), threads>>>(states, data, size / dimensions, generate_func, extra);
}
};
+7 -7
Просмотреть файл
@@ -10,7 +10,7 @@
#define KERNEL_CALL_AS_MACRO axpy<float><<<1, kDataLen>>>
#define KERNEL_NAME_MACRO axpy<float>
// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
// CHECK: #define COMPLETE_LAUNCH hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y)
#define COMPLETE_LAUNCH axpy<<<1, kDataLen>>>(a, device_x, device_y)
@@ -48,22 +48,22 @@ int main(int argc, char* argv[]) {
cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
// Launch the kernel in numerous different strange ways to exercise the prerocessor.
// CHECK: hipLaunchKernelGGL(axpy, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
// CHECK: hipLaunchKernelGGL((axpy), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
axpy<<<1, kDataLen>>>(a, device_x, device_y);
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, device_x, device_y);
axpy<float><<<1, kDataLen>>>(a, device_x, device_y);
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, a, TOKEN_PASTE(device, _x), device_y);
axpy<float><<<1, kDataLen>>>(a, TOKEN_PASTE(device, _x), device_y);
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
axpy<float><<<1, kDataLen>>>(ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL(KERNEL_NAME_MACRO, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL((KERNEL_NAME_MACRO), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
KERNEL_NAME_MACRO<<<1, kDataLen>>>(ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL(axpy<float>, dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL((axpy<float>), dim3(1), dim3(kDataLen), 0, 0, ARG_LIST_AS_MACRO);
KERNEL_CALL_AS_MACRO(ARG_LIST_AS_MACRO);
// CHECK: hipLaunchKernelGGL(empty, dim3(1), dim3(kDataLen), 0, 0);
+1 -1
Просмотреть файл
@@ -94,7 +94,7 @@ int main(int argc, char *argv[])
const unsigned threadsPerBlock = 256;
printf ("info: launch 'vector_square' kernel\n");
// CHECK: hipLaunchKernelGGL(vector_square, dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
// CHECK: hipLaunchKernelGGL((vector_square), dim3(blocks), dim3(threadsPerBlock), 0, 0, C_d, A_d, N);
vector_square <<<blocks, threadsPerBlock>>> (C_d, A_d, N);
printf ("info: copy Device2Host\n");