From 2994fb342927cd494447293ba9fa25677d65e60e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 14 Oct 2019 16:32:57 +0300 Subject: [PATCH 01/45] [HIPIFY][build fix] Add missing new source file --- hipify-clang/src/CUDA2HIP_CUB_API_types.cpp | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 hipify-clang/src/CUDA2HIP_CUB_API_types.cpp diff --git a/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp new file mode 100644 index 0000000000..0ef1912b54 --- /dev/null +++ b/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp @@ -0,0 +1,28 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "CUDA2HIP.h" + +// Maps the names of CUDA CUB API types to the corresponding HIP types +const std::map CUDA_CUB_TYPE_NAME_MAP{ + {"cub", {"hipcub", "", CONV_TYPE, API_CUB}}, +}; From c0f62157f917ef9aab4bd23798fe4135cf8a2929 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 14 Oct 2019 18:08:29 +0300 Subject: [PATCH 02/45] [HIPIFY] HipifyAction code clean-up --- hipify-clang/src/HipifyAction.cpp | 301 +++++++++++++----------------- hipify-clang/src/HipifyAction.h | 28 +-- 2 files changed, 141 insertions(+), 188 deletions(-) diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index c6d10f0cdc..930f3ec8c5 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -33,16 +33,20 @@ THE SOFTWARE. #include "StringUtils.h" #include "ArgParse.h" -namespace ct = clang::tooling; -namespace mat = clang::ast_matchers; - +const std::string sHIP = "HIP"; +const std::string sROC = "ROC"; +const std::string sCub = "cub"; const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED"; +const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string sHIP_SYMBOL = "HIP_SYMBOL"; -std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string s_reinterpret_cast = "reinterpret_cast"; -const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL("; +const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL"; const std::string sDim3 = "dim3("; - +const std::string s_hiprand_kernel_h = "hiprand_kernel.h"; +const std::string s_hiprand_h = "hiprand.h"; +const std::string sOnce = "once"; +const std::string s_string_literal = "[string literal]"; +// CUDA identifiers, used in matchers const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol"; const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync"; const std::string sCudaGetSymbolSize = "cudaGetSymbolSize"; @@ -51,6 +55,12 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol"; const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync"; const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig"; const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes"; +// Matchers' names +const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar"; +const StringRef sCudaLaunchKernel = "cudaLaunchKernel"; +const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; +const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; +const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -78,7 +88,7 @@ std::set ReinterpretFunctions1{ }; void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); size_t begin = 0; while ((begin = s.find("cu", begin)) != StringRef::npos) { const size_t end = s.find_first_of(" ", begin + 4); @@ -86,7 +96,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { const auto found = CUDA_RENAMES_MAP().find(name); if (found != CUDA_RENAMES_MAP().end()) { StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; + hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; Statistics::current().incrementCounter(counter, name.str()); if (!Statistics::isUnsupported(counter)) { clang::SourceLocation sl = start.getLocWithOffset(begin + 1); @@ -95,9 +105,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { insertReplacement(Rep, fullSL); } } - if (end == StringRef::npos) { - break; - } + if (end == StringRef::npos) break; begin = end + 1; } } @@ -109,7 +117,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { * If it's an unsupported CUDA identifier, a warning is emitted. * Otherwise, the source file is updated with the corresponding hipification. */ -void HipifyAction::RewriteToken(const clang::Token& t) { +void HipifyAction::RewriteToken(const clang::Token &t) { // String literals containing CUDA references need fixing. if (t.is(clang::tok::string_literal)) { StringRef s(t.getLiteralData(), t.getLength()); @@ -124,20 +132,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) { FindAndReplace(name, sl, CUDA_RENAMES_MAP()); } -void HipifyAction::FindAndReplace(llvm::StringRef name, +void HipifyAction::FindAndReplace(StringRef name, clang::SourceLocation sl, - const std::map& repMap, bool bReplace) { + const std::map &repMap, + bool bReplace) { const auto found = repMap.find(name); if (found == repMap.end()) { // So it's an identifier, but not CUDA? Boring. return; } Statistics::current().incrementCounter(found->second, name.str()); - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); // Warn the user about unsupported identifier. if (Statistics::isUnsupported(found->second)) { std::string sWarn; - Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP"; + Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP; sWarn = "" + sWarn; const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0."); DE.Report(sl, ID) << sWarn; @@ -147,7 +156,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, return; } StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); ct::Replacement Rep(SM, sl, name.size(), repName.str()); clang::FullSourceLoc fullSL(sl, SM); insertReplacement(Rep, fullSL); @@ -155,7 +164,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, namespace { -clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{}); @@ -167,7 +176,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan } } -clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); // If the range is contained within a macro, update the macro definition. @@ -178,7 +187,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; } -StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) { return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr); } @@ -186,53 +195,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp * Get a string representation of the expression `arg`, unless it's a defaulting function * call argument, in which case get a 0. Used for building argument lists to kernel calls. */ -std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) { - if (clang::isa(arg)) { - return "0"; - } else { - return readSourceText(SM, arg->getSourceRange()); - } +std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) { + if (clang::isa(arg)) return "0"; + else return readSourceText(SM, arg->getSourceRange()); } } // anonymous namespace -bool HipifyAction::Exclude(const hipCounter & hipToken) { +bool HipifyAction::Exclude(const hipCounter &hipToken) { switch (hipToken.type) { case CONV_INCLUDE_CUDA_MAIN_H: switch (hipToken.apiType) { case API_DRIVER: case API_RUNTIME: - if (insertedRuntimeHeader) { return true; } + if (insertedRuntimeHeader) return true; insertedRuntimeHeader = true; return false; case API_BLAS: - if (insertedBLASHeader) { return true; } + if (insertedBLASHeader) return true; insertedBLASHeader = true; return false; case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; return false; - } else if (hipToken.hipName == "hiprand.h") { - if (insertedRANDHeader) { return true; } + } else if (hipToken.hipName == s_hiprand_h) { + if (insertedRANDHeader) return true; insertedRANDHeader = true; return false; } case API_DNN: - if (insertedDNNHeader) { return true; } + if (insertedDNNHeader) return true; insertedDNNHeader = true; return false; case API_FFT: - if (insertedFFTHeader) { return true; } + if (insertedFFTHeader) return true; insertedFFTHeader = true; return false; case API_COMPLEX: - if (insertedComplexHeader) { return true; } + if (insertedComplexHeader) return true; insertedComplexHeader = true; return false; case API_SPARSE: - if (insertedSPARSEHeader) { return true; } + if (insertedSPARSEHeader) return true; insertedSPARSEHeader = true; return false; default: @@ -240,13 +246,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) { } return false; case CONV_INCLUDE: - if (hipToken.hipName.empty()) { - return true; - } + if (hipToken.hipName.empty()) return true; switch (hipToken.apiType) { case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; } return false; @@ -267,24 +271,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, clang::CharSourceRange filename_range, const clang::FileEntry*, StringRef, StringRef, const clang::Module*) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(hash_loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(hash_loc)) return; if (!firstHeader) { firstHeader = true; firstHeaderLoc = hash_loc; } const auto found = CUDA_INCLUDE_MAP.find(file_name); - if (found == CUDA_INCLUDE_MAP.end()) { - return; - } + if (found == CUDA_INCLUDE_MAP.end()) return; bool exclude = Exclude(found->second); Statistics::current().incrementCounter(found->second, file_name.str()); - clang::SourceLocation sl = filename_range.getBegin(); if (Statistics::isUnsupported(found->second)) { - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header.")); return; } @@ -293,11 +292,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, if (!exclude) { clang::SmallString<128> includeBuffer; llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - if (is_angled) { - newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); - } else { - newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); - } + if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); + else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); } else { // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'. sl = hash_loc; @@ -309,46 +305,33 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, } void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) { - if (pragmaOnce) { - return; - } - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); + if (pragmaOnce) return; + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); clang::Token tok; PP.Lex(tok); StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength()); - if (Text == "once") { + if (Text == sOnce) { pragmaOnce = true; pragmaOnceLoc = tok.getEndLoc(); } } -bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaLaunchKernel"; - const auto* launchKernel = Result.Nodes.getNodeAs(refName); - if (!launchKernel) { - return false; - } - const clang::Expr* calleeExpr = launchKernel->getCallee(); - if (!calleeExpr) { - return false; - } - const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee(); - if (!caleeDecl) { - return false; - } - const clang::CallExpr* config = launchKernel->getConfig(); - if (!config) { - return false; - } +bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) { + auto *launchKernel = Result.Nodes.getNodeAs(sCudaLaunchKernel); + if (!launchKernel) return false; + auto *calleeExpr = launchKernel->getCallee(); + if (!calleeExpr) return false; + auto *caleeDecl = launchKernel->getDirectCallee(); + if (!caleeDecl) return false; + auto *config = launchKernel->getConfig(); + if (!config) return false; clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::LangOptions DefaultLangOptions; - clang::SourceManager* SM = Result.SourceManager; - OS << sHipLaunchKernelGGL; + auto *SM = Result.SourceManager; + OS << sHipLaunchKernelGGL << "("; if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "("; OS << readSourceText(*SM, calleeExpr->getSourceRange()); if (caleeDecl->isTemplateInstantiation()) OS << ")"; @@ -381,29 +364,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc ct::Replacement Rep(*SM, launchStart, length, OS.str()); clang::FullSourceLoc fullSL(launchStart, *SM); insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; + Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str()); return true; } -bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaSharedIncompleteArrayVar"; - auto* sharedVar = Result.Nodes.getNodeAs(refName); - if (!sharedVar) { - return false; - } +bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) { + auto *sharedVar = Result.Nodes.getNodeAs(sCudaSharedIncompleteArrayVar); + if (!sharedVar) return false; // Example: extern __shared__ uint sRadix1[]; - if (!sharedVar->hasExternalFormalLinkage()) { - return false; - } + if (!sharedVar->hasExternalFormalLinkage()) return false; clang::QualType QT = sharedVar->getType(); std::string typeName; if (QT->isIncompleteArrayType()) { - const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); + const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); if (QT.getTypePtr()->isBuiltinType()) { QT = QT.getCanonicalType(); - const auto* BT = clang::dyn_cast(QT); + auto *BT = clang::dyn_cast(QT); if (BT) { clang::LangOptions LO; LO.CUDA = true; @@ -417,7 +395,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match if (!typeName.empty()) { clang::SourceLocation slStart = sharedVar->getOuterLocStart(); clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc()); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; std::string varName = sharedVar->getNameAsString(); std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")"; @@ -425,52 +403,39 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match clang::FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str()); return true; } return false; } -bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr *call = Result.Nodes.getNodeAs("cudaDeviceFuncCall")) { - const clang::FunctionDecl *funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; - } +bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (const clang::CallExpr *call = Result.Nodes.getNodeAs(sCudaDeviceFuncCall)) { + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false); return true; } return false; } -bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs("cubNamespacePrefix")) { - if (!decl) { - return false; - } +bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubNamespacePrefix)) { clang::QualType QT = decl->getUnderlyingType(); - const clang::Type* t = QT.getTypePtr(); - if (!t) { - return false; - } - const clang::ElaboratedType* et = t->getAs(); - if (!et) { - return false; - } + auto *t = QT.getTypePtr(); + if (!t) return false; + const clang::ElaboratedType *et = t->getAs(); + if (!et) return false; const clang::NestedNameSpecifier *nns = et->getQualifier(); - if (!nns) { - return false; - } + if (!nns) return false; const clang::NamespaceDecl *nsd = nns->getAsNamespace(); - if (!nsd) { - return false; - } + if (!nsd) return false; const clang::TypeSourceInfo *si = decl->getTypeSourceInfo(); const clang::TypeLoc tloc = si->getTypeLoc(); const clang::SourceRange sr = tloc.getSourceRange(); clang::SourceLocation sl(sr.getBegin()); clang::SourceLocation end(sr.getEnd()); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); @@ -484,15 +449,11 @@ bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::Ma return false; } -bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr * call = Result.Nodes.getNodeAs("cudaHostFuncCall")) { - if (!call->getNumArgs()) { - return false; - } - const clang::FunctionDecl* funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; - } +bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) { + if (!call->getNumArgs()) return false; + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; std::string sName = funcDcl->getDeclName().getAsString(); unsigned int argNum = 0; bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false; @@ -506,7 +467,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::SourceRange sr = call->getArg(argNum)->getSourceRange(); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")"; clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() }); clang::SourceLocation s = replacementRange.getBegin(); @@ -521,7 +482,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc return false; } -void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) { +void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) { llcompat::insertReplacement(*replacements, rep); if (PrintStats) { rep.getLength(); @@ -530,10 +491,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu } } -std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) { - Finder.reset(new clang::ast_matchers::MatchFinder); +std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) { + Finder.reset(new mat::MatchFinder); // Replace the <<<...>>> language extension with a hip kernel launch - Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this); + Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this); Finder->addMatcher( mat::varDecl( mat::isExpansionInMainFile(), @@ -541,7 +502,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::hasAttr(clang::attr::CUDAShared), mat::hasType(mat::incompleteArrayType()) ) - ).bind("cudaSharedIncompleteArrayVar"), + ).bind(sCudaSharedIncompleteArrayVar), this ); Finder->addMatcher( @@ -561,7 +522,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ) ) ) - ).bind("cudaHostFuncCall"), + ).bind(sCudaHostFuncCall), this ); Finder->addMatcher( @@ -576,7 +537,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::unless(mat::hasAttr(clang::attr::CUDAHost)) ) ) - ).bind("cudaDeviceFuncCall"), + ).bind(sCudaDeviceFuncCall), this ); Finder->addMatcher( @@ -586,12 +547,12 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::elaboratedType( mat::hasQualifier( mat::specifiesNamespace( - mat::hasName("cub") + mat::hasName(sCub) ) ) ) ) - ).bind("cubNamespacePrefix"), + ).bind(sCubNamespacePrefix), this ); // Ownership is transferred to the caller. @@ -599,10 +560,8 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi } void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength()); Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc())); } @@ -615,12 +574,12 @@ void HipifyAction::EndSourceFileAction() { // one copy of the hip include into every file. bool placeForIncludeCalculated = false; clang::SourceLocation sl, controllingMacroLoc; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::HeaderSearch& HS = PP.getHeaderSearchInfo(); - clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup(); - const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID()); - const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); + auto &SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + clang::HeaderSearch &HS = PP.getHeaderSearchInfo(); + clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup(); + const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID()); + const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); if (controllingMacro) { auto found = Ifndefs.find(controllingMacro->getName().str()); if (found != Ifndefs.end()) { @@ -629,19 +588,13 @@ void HipifyAction::EndSourceFileAction() { } } if (pragmaOnce) { - if (placeForIncludeCalculated) { - sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; - } else { - sl = pragmaOnceLoc; - } + if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; + else sl = pragmaOnceLoc; placeForIncludeCalculated = true; } if (!placeForIncludeCalculated) { - if (firstHeader) { - sl = firstHeaderLoc; - } else { - sl = SM.getLocForStartOfFile(SM.getMainFileID()); - } + if (firstHeader) sl = firstHeaderLoc; + else sl = SM.getLocForStartOfFile(SM.getMainFileID()); } clang::FullSourceLoc fullSL(sl, SM); ct::Replacement Rep(SM, sl, 0, "\n#include \n"); @@ -656,15 +609,15 @@ namespace { * A silly little class to proxy PPCallbacks back to the HipifyAction class. */ class PPCallbackProxy : public clang::PPCallbacks { - HipifyAction& hipifyAction; + HipifyAction &hipifyAction; public: - explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {} + explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {} - void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token, + void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, StringRef file_name, bool is_angled, clang::CharSourceRange filename_range, - const clang::FileEntry* file, StringRef search_path, StringRef relative_path, - const clang::Module* imported + const clang::FileEntry *file, StringRef search_path, StringRef relative_path, + const clang::Module *imported #if LLVM_VERSION_MAJOR > 6 , clang::SrcMgr::CharacteristicKind FileType #endif @@ -688,10 +641,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) { } void HipifyAction::ExecuteAction() { - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + auto &SM = getCompilerInstance().getSourceManager(); // Start lexing the specified input file. - const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID()); + const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID()); clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts()); RawLex.SetKeepWhitespaceMode(true); // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough @@ -709,7 +662,7 @@ void HipifyAction::ExecuteAction() { clang::ASTFrontendAction::ExecuteAction(); } -void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) { +void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaLaunchKernel(Result)) return; if (cudaSharedIncompleteArrayVar(Result)) return; if (cudaHostFuncCall(Result)) return; diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h index a24404deee..78532449ee 100644 --- a/hipify-clang/src/HipifyAction.h +++ b/hipify-clang/src/HipifyAction.h @@ -31,17 +31,18 @@ THE SOFTWARE. #include "Statistics.h" namespace ct = clang::tooling; +namespace mat = clang::ast_matchers; using namespace llvm; /** * A FrontendAction that hipifies CUDA programs. */ class HipifyAction : public clang::ASTFrontendAction, - public clang::ast_matchers::MatchFinder::MatchCallback { + public mat::MatchFinder::MatchCallback { private: - ct::Replacements* replacements; + ct::Replacements *replacements; std::map Ifndefs; - std::unique_ptr Finder; + std::unique_ptr Finder; // CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did // not, we insert it at the top of the file when we finish processing it. @@ -67,12 +68,11 @@ public: explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(), replacements(replacements) {} // MatchCallback listeners - bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result); + bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result); + bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result); + bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, @@ -91,7 +91,7 @@ public: protected: // Add a Replacement for the current file. These will all be applied after executing the FrontendAction. - void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL); + void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL); // FrontendAction entry point. void ExecuteAction() override; // Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options. @@ -99,8 +99,8 @@ protected: // Called at the start of each new file to process. void EndSourceFileAction() override; // MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for. - void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override; - std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override; - bool Exclude(const hipCounter & hipToken); - void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map& repMap, bool bReplace = true); + void run(const mat::MatchFinder::MatchResult &Result) override; + std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override; + bool Exclude(const hipCounter &hipToken); + void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map &repMap, bool bReplace = true); }; From d57c9a844bda3d18c477cd5e78c4673d54495faa Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 15 Oct 2019 11:26:03 +0300 Subject: [PATCH 03/45] [HIPIFY][cmake] Make CMakeLists.txt compatible with default cmake 3.5.2 for Ubuntu 16.04 + Update README.md accordingly --- hipify-clang/CMakeLists.txt | 10 +++------- hipify-clang/README.md | 6 +++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index 5eb71b3a3e..fd172a9c8d 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CUDA_VERSION VERSION_GREATER "9.2") - cmake_minimum_required(VERSION 3.12.3) -else() - cmake_minimum_required(VERSION 3.7.2) -endif() +cmake_minimum_required(VERSION 3.5.2) project(hipify-clang) @@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE LLVMOption LLVMCore) -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1") target_link_libraries(hipify-clang PRIVATE clangToolingInclusions) endif() @@ -72,7 +68,7 @@ else() set(StdCpp "-std=c++") endif() -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0") string(APPEND StdCpp "14") # MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest elseif (MSVC) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index 07466dbe62..bb9654d8b4 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -63,7 +63,7 @@ In most cases, you can get a suitable version of LLVM+CLANG with your package manager. Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set -[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` +[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` ## Build and install @@ -196,7 +196,7 @@ Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 Build system for the above configurations: -Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min). +Python 2.7 (min), cmake 3.5.2 (min), GNU C/C++ 5.4.0 (min). Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -345,7 +345,7 @@ LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38 Build system for the above configurations: -Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). +Python 3.6 - 3.7.4, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: From 3444834e3a465c750f2be76588fd4c90e964797e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 15 Oct 2019 14:20:23 +0300 Subject: [PATCH 04/45] [HIPIFY][tests] Exclude tests for the libs, which are not defined in cmake command line + affects cuDNN and CUB tests, paths to libraries of which are defined by CUDA_DNN_ROOT_DIR and CUDA_CUB_ROOT_DIR + Warn about excluding and why, for instance: "WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR" --- tests/hipify-clang/lit.cfg | 46 ++++++++++++++++++++++++++++---------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index 1d092a4327..89e17249f0 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -12,18 +12,31 @@ import lit.util site_cfg = lit_config.params.get('site_config', None) lit_config.load_config(config, site_cfg) -print(str("========================================")) +config.excludes = ['cmdparser.hpp'] +config.excludes.append('spatial_batch_norm_op.h') +config.excludes.append('common_cudnn.h') + +delimiter = "==============================================================="; +print(delimiter) print("CUDA " + config.cuda_version + " - will be used for testing") print("LLVM " + config.llvm_version + " - will be used for testing") print(platform.machine() + " - Platform architecture") print(platform.system() + " " + platform.release() + " - Platform OS") print(str(config.pointer_size * 8) + " - hipify-clang binary bitness") print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness") -print(str("========================================")) - -config.excludes = ['cmdparser.hpp'] -config.excludes.append('spatial_batch_norm_op.h') -config.excludes.append('common_cudnn.h') +print(delimiter) +warns = None +if not config.cuda_dnn_root: + config.excludes.append('cudnn_convolution_forward.cu') + config.excludes.append('cudnn_softmax.cu') + print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR") + warns = True +if not config.cuda_cub_root: + config.excludes.append('cub_01.cu') + print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR") + warns = True +if warns: + print(delimiter) if config.cuda_version_major == 7 and config.cuda_version_minor == 0: config.excludes.append('headers_test_09.cu') @@ -90,17 +103,26 @@ else: run_test_ext = ".sh" # CUDA SDK ROOT clang_arguments += " -isystem'%s'/samples/common/inc" -# cuDNN ROOT -clang_arguments += " -I'%s'/include" if config.pointer_size == 8: clang_arguments += " -D__LP64__" +# cuDNN ROOT +if config.cuda_dnn_root: + clang_arguments += " -I'%s'/include" # CUB ROOT -clang_arguments += " -I'%s'" +if config.cuda_cub_root: + clang_arguments += " -I'%s'" + +if config.cuda_dnn_root and config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) +elif config.cuda_dnn_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root))) +elif config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root))) +else: + config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root)) hipify_arguments = "--cuda-path='%s'" - -config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) -config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root))) +config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root)) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) From 0b52c1d9d839864a36ee019f7ce3345c405e1fe6 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:18:42 -0400 Subject: [PATCH 05/45] Update the declarations of hipMemsetD8, hipMemsetD8Async, hipMemsetD16, hipMemsetD16Async. These functions are type aware and take in as their third argument the number of elements in the buffer, not the buffer size. Change the name of this argument from sizeBytes to count to align with the above description. --- include/hip/hcc_detail/hip_runtime_api.h | 16 ++++++++-------- src/hip_memory.cpp | 24 ++++++++++++------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index c07d2ad9f1..b4402fd67a 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes); * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the memory area pointed to by dest with the constant integer diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index ed1422fcda..48b83287f3 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -1935,15 +1935,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si return ihipLogStatus(e); }; -hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) { + HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = hipStreamNull; stream = ihipSyncAndResolveStream(stream); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar); stream->locked_wait(); } else { e = hipErrorInvalidValue; @@ -1951,23 +1951,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes return ihipLogStatus(e); } -hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) { + HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar)); } else { return ihipLogStatus(hipErrorInvalidValue); } } -hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){ - HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){ + HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort); if(hipSuccess == e) stream->locked_wait(); } else { @@ -1976,12 +1976,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt return ihipLogStatus(e); } -hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){ - HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){ + HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort)); } else { return ihipLogStatus(hipErrorInvalidValue); } From 3db2ecc52b379caaf8f80842089701b37098a323 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:20:14 -0400 Subject: [PATCH 06/45] hipMemset2D test should pass only if both async and sync subtests pass. --- tests/src/runtimeApi/memory/hipMemset2D.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp index b3bcf42222..73f3f5d415 100644 --- a/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -113,8 +113,8 @@ int main(int argc, char *argv[]) hipCtxCreate(&context, 0, p_gpuDevice); bool testResult = false; - testResult = testhipMemset2D(memsetval, p_gpuDevice); - testResult = testhipMemset2DAsync(memsetval, p_gpuDevice); + testResult &= testhipMemset2D(memsetval, p_gpuDevice); + testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice); hipCtxDestroy(context); if(testResult){ passed(); From 00425bdf3dd3a26fe502458d0cf77349603be1af Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:24:04 -0400 Subject: [PATCH 07/45] Add async subtest to hipMemSet3D --- tests/src/runtimeApi/memory/hipMemset3D.cpp | 55 ++++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemset3D.cpp b/tests/src/runtimeApi/memory/hipMemset3D.cpp index 11bd656761..ac26280314 100644 --- a/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -79,12 +79,63 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice) return testResult; } +bool testhipMemset3DAsync(int memsetval,int p_gpuDevice) +{ + size_t numH = 256; + size_t numW = 256; + size_t depth = 10; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH * depth; + size_t elements = numW* numH* depth; + + + printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); + char *A_h; + bool testResult = true; + hipExtent extent = make_hipExtent(width, numH, depth); + hipPitchedPtr devPitchedPtr; + + HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); + A_h = (char*)malloc(sizeElements); + HIPASSERT(A_h != NULL); + for (size_t i=0; i Date: Tue, 15 Oct 2019 15:29:14 -0400 Subject: [PATCH 08/45] Update indentation in the hipMemset3D test. Replace all tabs with four spaces. --- tests/src/runtimeApi/memory/hipMemset3D.cpp | 52 ++++++++++----------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemset3D.cpp b/tests/src/runtimeApi/memory/hipMemset3D.cpp index ac26280314..ce2459a438 100644 --- a/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -41,27 +41,27 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice) size_t elements = numW* numH* depth; - printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); + printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); char *A_h; bool testResult = true; hipExtent extent = make_hipExtent(width, numH, depth); hipPitchedPtr devPitchedPtr; HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); - A_h = (char*)malloc(sizeElements); - HIPASSERT(A_h != NULL); - for (size_t i=0; i Date: Tue, 15 Oct 2019 17:15:49 -0400 Subject: [PATCH 09/45] In the hipMemset2D and hipMemset3D tests synchronize with the default stream after performing an async memset. --- tests/src/runtimeApi/memory/hipMemset2D.cpp | 7 ++++--- tests/src/runtimeApi/memory/hipMemset3D.cpp | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp index 73f3f5d415..449f0b6f78 100644 --- a/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice) char *A_d; char *A_h; bool testResult = true; - HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) ); + HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16)); A_h = (char*)malloc(sizeElements); HIPASSERT(A_h != NULL); for (size_t i=0; i Date: Tue, 15 Oct 2019 21:47:33 -0400 Subject: [PATCH 10/45] Add -fhip-new-launch-api to hipcc for HIP/VDI --- bin/hipcc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/hipcc b/bin/hipcc index 35fbb54397..77b7b7ebdd 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") { if ($HIP_RUNTIME eq "HCC" ) { $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIPCXXFLAGS .= " -isystem $HSA_PATH/include"; + } else { + $HIPCXXFLAGS .= " -fhip-new-launch-api"; } } elsif ($HIP_PLATFORM eq "hcc") { From e9adb8e9326ced1d4131ad47516e4973f223d8b0 Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng Date: Wed, 16 Oct 2019 00:18:55 -0500 Subject: [PATCH 11/45] Fix struct declaration for C (#1524) This change is necessary for HPCToolkit to use Roctracer to produce code centric profiling view. --- hip_prof_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hip_prof_gen.py b/hip_prof_gen.py index d1203a64d6..9e90c1558c 100755 --- a/hip_prof_gen.py +++ b/hip_prof_gen.py @@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map): # Generating the callbacks data structure f.write('\n// HIP API callbacks data structure\n') f.write( - 'struct hip_api_data_t {\n' + + 'typedef struct hip_api_data_t {\n' + ' uint64_t correlation_id;\n' + ' uint32_t phase;\n' + ' union {\n' @@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map): f.write(' } ' + name + ';\n') f.write( ' } args;\n' + - '};\n' + '} hip_api_data_t;\n' ) # Generating the callbacks args data filling macros From c47c2c4f9a38e59235d92eb1d022cc7c66f43808 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Wed, 16 Oct 2019 10:49:16 +0530 Subject: [PATCH 12/45] Adding code object manager to rtc (#1526) Adding Code Object Manager file to rtc to resolve address of Bundled_code_object in libhiprtc.so --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ad40a10ea..28b8683b22 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hip_hcc PRIVATE hc_am) target_link_libraries(hip_hcc_static PRIVATE hc_am) - add_library(hiprtc SHARED src/hiprtc.cpp) + add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp) target_include_directories( hiprtc SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include) From 73ca2b00832b51cdc413e92a337211b37b246c6a Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Wed, 16 Oct 2019 00:19:56 -0500 Subject: [PATCH 13/45] Guard against division by zero for no VGPR usage (e.g., in an empty kernel) (#1528) * guard against division by zero for no VGPR usage (e.g., in an empty kernel) * fix bracket format * clean up parenthesis --- src/hip_module.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hip_module.cpp b/src/hip_module.cpp index d98b98a378..b3afdd4ffe 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -1326,17 +1326,18 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); // Calculate blocks occupancy per CU based on VGPR usage *numBlocks = vgprs_alu_occupancy / numWavefronts; const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) { From 596bf4e326f77ff56cda3bf8f8070480c99bb460 Mon Sep 17 00:00:00 2001 From: kpyzhov Date: Wed, 16 Oct 2019 01:21:28 -0400 Subject: [PATCH 14/45] [hipcc] Temporary add -D_OPENMP to clang options to workaround cmake issue (#1540) * Temporary add -D_OPENMP to clang options in hipcc to allow using CMake OpenMP detection with hip-clang (until updated CMake version is available). --- bin/hipcc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/bin/hipcc b/bin/hipcc index 35fbb54397..0f97bf0e21 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -526,6 +526,13 @@ foreach $arg (@ARGV) $optArg = $arg; } + ## This is a temporary workaround for CMake detection of OpenMP support. + ## It should be removed when the OpenMP detection c++ test in CMake is updated + ## and corrected CMake version is available. + if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) { + $HIPCXXFLAGS .= " -D_OPENMP " + } + ## process linker response file for hip-clang ## extract object files from static library and pass them directly to ## hip-clang in command line. From 2fb734ef5f2711097754a085aa298b3217ceb0f5 Mon Sep 17 00:00:00 2001 From: vsytch Date: Wed, 16 Oct 2019 01:21:48 -0400 Subject: [PATCH 15/45] Update hipMathFunctions, hipTestHalf and hipTestNativeHalf tests to support Navi10 and Navi14. (#1545) --- tests/src/deviceLib/hipMathFunctions.cpp | 7 ++++++- tests/src/deviceLib/hipTestHalf.cpp | 7 ++++++- tests/src/deviceLib/hipTestNativeHalf.cpp | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/tests/src/deviceLib/hipMathFunctions.cpp b/tests/src/deviceLib/hipMathFunctions.cpp index dc064da189..b1b0e8334a 100644 --- a/tests/src/deviceLib/hipMathFunctions.cpp +++ b/tests/src/deviceLib/hipMathFunctions.cpp @@ -29,7 +29,12 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void kernel_abs_int64(long long *input, long long *output) { int tx = threadIdx.x; diff --git a/tests/src/deviceLib/hipTestHalf.cpp b/tests/src/deviceLib/hipTestHalf.cpp index b78e1d8c63..751d44e242 100644 --- a/tests/src/deviceLib/hipTestHalf.cpp +++ b/tests/src/deviceLib/hipTestHalf.cpp @@ -28,7 +28,12 @@ THE SOFTWARE. #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __device__ void test_convert() { __half x; diff --git a/tests/src/deviceLib/hipTestNativeHalf.cpp b/tests/src/deviceLib/hipTestNativeHalf.cpp index 55213d446d..8639127c7c 100644 --- a/tests/src/deviceLib/hipTestNativeHalf.cpp +++ b/tests/src/deviceLib/hipTestNativeHalf.cpp @@ -32,7 +32,12 @@ THE SOFTWARE. using namespace std; -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void __halfTest(bool* result, __half a) { From fbb98f49508361c039dc1cd01bb7234ce7e1bf7c Mon Sep 17 00:00:00 2001 From: kjayapra-amd <54370791+kjayapra-amd@users.noreply.github.com> Date: Wed, 16 Oct 2019 01:22:15 -0400 Subject: [PATCH 16/45] Use the correct return type in runTest in 11_texture_driver sample. (#1546) Fixes SWDEV-203394. Currently in runTest() returns true, even if the texture reference copy does not happen. Using the existing testResult Flag to return from runTest(). --- samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp diff --git a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp old mode 100644 new mode 100755 index 2cb9877cac..b3c1ef5d0c --- a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define fileName "tex2dKernel.code" texture tex; -bool testResult = false; +bool testResult = true; #define HIP_CHECK(cmd) \ { \ @@ -126,7 +126,7 @@ bool runTest(int argc, char** argv) { } hipFree(dData); hipFreeArray(array); - return true; + return testResult; } int main(int argc, char** argv) { From 6960574850c9dfe065cb383094031101af0f8d79 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 13:08:11 +0300 Subject: [PATCH 17/45] [HIPIFY][CUB][#1460] Implement cubFunctionTemplateDecl matcher + Add cub_02.cu test + Partial fixes #1460 --- hipify-clang/src/HipifyAction.cpp | 44 ++++++++++++ hipify-clang/src/HipifyAction.h | 1 + .../unit_tests/libraries/CUB/cub_02.cu | 70 +++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index 930f3ec8c5..60288e7500 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -61,6 +61,7 @@ const StringRef sCudaLaunchKernel = "cudaLaunchKernel"; const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; +const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -449,6 +450,41 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul return false; } +bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) { + auto *Tparams = decl->getTemplateParameters(); + bool ret = false; + for (size_t I = 0; I < Tparams->size(); ++I) { + const clang::ValueDecl *valueDecl = dyn_cast(Tparams->getParam(I)); + if (!valueDecl) continue; + clang::QualType QT = valueDecl->getType(); + auto *t = QT.getTypePtr(); + if (!t) continue; + const clang::ElaboratedType *et = t->getAs(); + if (!et) continue; + const clang::NestedNameSpecifier *nns = et->getQualifier(); + if (!nns) continue; + const clang::NamespaceDecl *nsd = nns->getAsNamespace(); + if (!nsd) continue; + const clang::SourceRange sr = valueDecl->getSourceRange(); + clang::SourceLocation sl(sr.getBegin()); + clang::SourceLocation end(sr.getEnd()); + auto &SM = getCompilerInstance().getSourceManager(); + size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); + StringRef sfull = StringRef(SM.getCharacterData(sl), length); + std::string name = nsd->getDeclName().getAsString(); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + ret = true; + } + return ret; + } + return false; +} + bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) { if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) { if (!call->getNumArgs()) return false; @@ -555,6 +591,13 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ).bind(sCubNamespacePrefix), this ); + // TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl + Finder->addMatcher( + mat::functionTemplateDecl( + mat::isExpansionInMainFile() + ).bind(sCubFunctionTemplateDecl), + this + ); // Ownership is transferred to the caller. return Finder->newASTConsumer(); } @@ -668,4 +711,5 @@ void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaHostFuncCall(Result)) return; if (cudaDeviceFuncCall(Result)) return; if (cubNamespacePrefix(Result)) return; + if (cubFunctionTemplateDecl(Result)) return; } diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h index 78532449ee..38a0deba9e 100644 --- a/hipify-clang/src/HipifyAction.h +++ b/hipify-clang/src/HipifyAction.h @@ -73,6 +73,7 @@ public: bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result); bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); + bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu new file mode 100644 index 0000000000..aff5def3fa --- /dev/null +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu @@ -0,0 +1,70 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +#include + +template +__global__ void sort(const T* data_in, T* data_out){ + // CHECK: typedef hipcub::BlockLoad BlockLoadT; + typedef cub::BlockLoad BlockLoadT; + // CHECK: typedef hipcub::BlockRadixSort BlockRadixSortT; + typedef cub::BlockRadixSort BlockRadixSortT; + // CHECK: typedef hipcub::BlockStore BlockStoreT; + typedef cub::BlockStore BlockStoreT; + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + typename BlockStoreT::TempStorage store; + } tmp_storage; + T items[ITEMS_PER_THREAD]; + BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); + __syncthreads(); + BlockRadixSortT(tmp_storage.sort).Sort(items); + __syncthreads(); + BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); +} + +int main() { + double* d_gpu = NULL; + double* result_gpu = NULL; + double* data_sorted = new double[1000*4096]; + // Allocate memory on the GPU + // CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&d_gpu, 1000*4096 * sizeof(double)); + // CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&result_gpu, 1000*4096 * sizeof(double)); + // CHECK: hiprandGenerator_t gen; + curandGenerator_t gen; + // Create generator + // CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT); + curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); + // Fill array with random numbers + // CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + // Destroy generator + // CHECK: hiprandDestroyGenerator(gen); + curandDestroyGenerator(gen); + // Sort data + // TODO: Substitution of cub namespace in CUDAKernelCallExpr + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); + sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); + sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu); + // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost); + cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost); + // Write the sorted data to standard out + for (int i = 0; i < 4095; ++i) { + std::cout << data_sorted[i] << ", "; + } + std::cout << data_sorted[4095] << std::endl; +} From 809a67a4f66fe86a6913d0343e74075d68bd41c8 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 13:43:56 +0300 Subject: [PATCH 18/45] [HIPIFY] Refactor a couple of matcher functions + Separate out GetSubstrLocation function for finding substr SourceLocation in a given SourceRange --- hipify-clang/src/HipifyAction.cpp | 35 +++++++++++++------------------ hipify-clang/src/HipifyAction.h | 2 ++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index 60288e7500..269e66fed1 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -111,6 +111,19 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { } } +clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) { + clang::SourceLocation sl(sr.getBegin()); + clang::SourceLocation end(sr.getEnd()); + auto &SM = getCompilerInstance().getSourceManager(); + size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); + StringRef sfull = StringRef(SM.getCharacterData(sl), length); + size_t offset = sfull.find(str); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + return sl; +} + /** * Look at, and consider altering, a given token. * @@ -434,17 +447,8 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul const clang::TypeSourceInfo *si = decl->getTypeSourceInfo(); const clang::TypeLoc tloc = si->getTypeLoc(); const clang::SourceRange sr = tloc.getSourceRange(); - clang::SourceLocation sl(sr.getBegin()); - clang::SourceLocation end(sr.getEnd()); - auto &SM = getCompilerInstance().getSourceManager(); - size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); - StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); - size_t offset = sfull.find(name); - if (offset > 0) { - sl = sl.getLocWithOffset(offset); - } - FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); return true; } return false; @@ -467,17 +471,8 @@ bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult & const clang::NamespaceDecl *nsd = nns->getAsNamespace(); if (!nsd) continue; const clang::SourceRange sr = valueDecl->getSourceRange(); - clang::SourceLocation sl(sr.getBegin()); - clang::SourceLocation end(sr.getEnd()); - auto &SM = getCompilerInstance().getSourceManager(); - size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); - StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); - size_t offset = sfull.find(name); - if (offset > 0) { - sl = sl.getLocWithOffset(offset); - } - FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); ret = true; } return ret; diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h index 38a0deba9e..73879bfd14 100644 --- a/hipify-clang/src/HipifyAction.h +++ b/hipify-clang/src/HipifyAction.h @@ -63,6 +63,8 @@ private: void RewriteString(StringRef s, clang::SourceLocation start); // Replace a CUDA identifier with the corresponding hip identifier, if applicable. void RewriteToken(const clang::Token &t); + // Calculate str's SourceLocation in SourceRange sr + clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr); public: explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(), From c747b77ac1d8e495d2ad72f795adb46bf1615d69 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Wed, 16 Oct 2019 11:02:38 -0400 Subject: [PATCH 19/45] hipMemset2D and hipMemset3D tests should be passing by default. --- tests/src/runtimeApi/memory/hipMemset2D.cpp | 2 +- tests/src/runtimeApi/memory/hipMemset3D.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemset2D.cpp b/tests/src/runtimeApi/memory/hipMemset2D.cpp index 449f0b6f78..2eb62a859f 100644 --- a/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) hipCtx_t context; hipCtxCreate(&context, 0, p_gpuDevice); - bool testResult = false; + bool testResult = true; testResult &= testhipMemset2D(memsetval, p_gpuDevice); testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice); hipCtxDestroy(context); diff --git a/tests/src/runtimeApi/memory/hipMemset3D.cpp b/tests/src/runtimeApi/memory/hipMemset3D.cpp index a47b609c73..1917559f2a 100644 --- a/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); HIPCHECK(hipSetDevice(p_gpuDevice)); - bool testResult = false; + bool testResult = true; testResult &= testhipMemset3D(memsetval, p_gpuDevice); testResult &= testhipMemset3DAsync(memsetval, p_gpuDevice); if (testResult) { From edfd05a86d683659d2d701bbfaefbba6054179d1 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 19:02:13 +0300 Subject: [PATCH 20/45] [HIPIFY][CUB][#1460] Add cub:: namespace support in TemplateInstantiation of cudaLaunchKernel + Update cub_02.cu test accordingly --- hipify-clang/src/HipifyAction.cpp | 22 +++++++++++++++++-- .../unit_tests/libraries/CUB/cub_02.cu | 5 ++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index 269e66fed1..f63da1b2de 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -36,6 +36,7 @@ THE SOFTWARE. const std::string sHIP = "HIP"; const std::string sROC = "ROC"; const std::string sCub = "cub"; +const std::string sHipcub = "hipcub"; const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED"; const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string sHIP_SYMBOL = "HIP_SYMBOL"; @@ -345,9 +346,26 @@ bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) llvm::raw_svector_ostream OS(XStr); clang::LangOptions DefaultLangOptions; auto *SM = Result.SourceManager; + clang::SourceRange sr = calleeExpr->getSourceRange(); + std::string kern = readSourceText(*SM, sr).str(); OS << sHipLaunchKernelGGL << "("; - if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "("; - OS << readSourceText(*SM, calleeExpr->getSourceRange()); + if (caleeDecl->isTemplateInstantiation()) { + OS << sHIP_KERNEL_NAME << "("; + std::string cub = sCub + "::"; + std::string hipcub; + const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub); + if (found != CUDA_CUB_TYPE_NAME_MAP.end()) { + hipcub = found->second.hipName.str() + "::"; + } else { + hipcub = sHipcub + "::"; + } + size_t pos = kern.find(cub); + while (pos != std::string::npos) { + kern.replace(pos, cub.size(), hipcub); + pos = kern.find(cub, pos + hipcub.size()); + } + } + OS << kern; if (caleeDecl->isTemplateInstantiation()) OS << ")"; OS << ", "; // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero. diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu index aff5def3fa..21898baa03 100644 --- a/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu @@ -55,10 +55,9 @@ int main() { // CHECK: hiprandDestroyGenerator(gen); curandDestroyGenerator(gen); // Sort data - // TODO: Substitution of cub namespace in CUDAKernelCallExpr - // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu); - // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu); // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost); cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost); From f19e7c29dfcef3d0c941c977ce6d1448ad4e288a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 17 Oct 2019 15:05:55 +0300 Subject: [PATCH 21/45] [HIPIFY][cmake] Add install rule for clang-resource-headers + Fix: set destination for all installing files to ${CMAKE_INSTALL_PREFIX} --- hipify-clang/CMakeLists.txt | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index fd172a9c8d..bf4ef6872d 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -80,7 +80,21 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"") -install(TARGETS hipify-clang DESTINATION bin) +install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX}) + +install( + DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX} + COMPONENT clang-resource-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.modulemap" + PATTERN "algorithm" + PATTERN "complex" + PATTERN "new" + PATTERN "ppc_wrappers" EXCLUDE + PATTERN "openmp_wrappers" EXCLUDE + ) if (HIPIFY_CLANG_TESTS) find_package(PythonInterp 2.7 REQUIRED) From 5f37f3174a3b17875405d6eefd6ad00b77adf91a Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 17 Oct 2019 09:11:32 -0700 Subject: [PATCH 22/45] Revert "hipcc defaults to code object v3 (#1298)" This reverts commit d39a2a0749e4b3deffd2717dc8209d668d5f9871. --- bin/hipcc | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/bin/hipcc b/bin/hipcc index 0f97bf0e21..3686d51abe 100755 --- a/bin/hipcc +++ b/bin/hipcc @@ -352,7 +352,6 @@ my $runCmd = 1; my $buildDeps = 0; my $linkType = 1; my $setLinkType = 0; -my $coFormatv3 = 1; my @options = (); my @inputs = (); @@ -472,22 +471,6 @@ foreach $arg (@ARGV) $swallowArg = 1; } - # code object format parsing - if ($trimarg eq '-mcode-object-v3') { - $coFormatv3 = 1; - # hip-clang already recognizes -mcode-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if ($trimarg eq '-mno-code-object-v3') { - $coFormatv3 = 0; - # hip-clang already recognizes -mno-code-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) { $arg = "--cuda-device-only"; } @@ -870,12 +853,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){ } } -# hcc defaults to v2, so we need to convert to the appropriate flag -# hip-clang defaults to v3, so we don't need to do anything -if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') { - $HIPLDFLAGS .= " -Wl,-hcc-cov3 "; -} - if ($hasC and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -x cu"; } From 7ecbd7100419998bbb7fad16909848995c9364f5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 17 Oct 2019 22:26:48 +0300 Subject: [PATCH 23/45] [HIPIFY][doc] Update README.md + Versions, testing --- hipify-clang/README.md | 155 +++++++++++++++++++++-------------------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index bb9654d8b4..f53a87f95c 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -154,7 +154,7 @@ To run it: * Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option: - - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB` + - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB` - Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub` @@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38 Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 -Build system for the above configurations: +Minimum build system requirements for the above configurations: -Python 2.7 (min), cmake 3.5.2 (min), GNU C/C++ 5.4.0 (min). +Python 2.7, cmake 3.5.2, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -208,7 +208,7 @@ cmake -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \ -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \ - -DCUDA_CUB_ROOT_DIR=/srv/CUB \ + -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \ -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \ .. ``` @@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS 64 - hipify-clang binary bitness 64 - python 2.7.12 binary bitness ======================================== --- Testing: 64 tests, 12 threads -- -PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64) -PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64) -PASS: hipify :: unit_tests/device/atomics.cu (3 of 64) -PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64) -PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64) -PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64) -PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64) -PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64) -PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64) -PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64) -PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64) -PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64) -PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64) -PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64) -PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64) -PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64) -PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64) -PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64) -PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64) -PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64) -PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64) -PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64) -PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64) -PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64) -PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64) -PASS: hipify :: unit_tests/samples/intro.cu (61 of 64) -PASS: hipify :: unit_tests/samples/square.cu (62 of 64) -PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64) -PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64) -Testing Time: 2.98s - Expected Passes : 64 +-- Testing: 65 tests, 12 threads -- +PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65) +PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65) +PASS: hipify :: unit_tests/device/atomics.cu (3 of 65) +PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65) +PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65) +PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65) +PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65) +PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65) +PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65) +PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65) +PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65) +PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65) +PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65) +PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65) +PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65) +PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65) +PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65) +PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65) +PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65) +PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65) +PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65) +PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65) +PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65) +PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65) +PASS: hipify :: unit_tests/samples/intro.cu (61 of 65) +PASS: hipify :: unit_tests/samples/square.cu (62 of 65) +PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65) +PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65) +Testing Time: 3.01s + Expected Passes : 65 [100%] Built target test-hipify ``` ### Windows On Windows 10 the following configurations are tested: -LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18 +LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18 -LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38 -LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38 -Build system for the above configurations: +Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2: -Python 3.6 - 3.7.4, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). +Python 3.6.0 - 3.8.0, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: @@ -371,7 +372,7 @@ cmake -- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm -- - Include path : F:/LLVM/9.0.0/dist/include -- - Binary path : F:/LLVM/9.0.0/dist/bin --- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6") +-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6") -- Found lit: C:/Program Files/Python36/Scripts/lit.exe -- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1") From 31e57f8b64c3a4b8768aa67ab87ab26cf3003abb Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Thu, 17 Oct 2019 21:21:24 +0000 Subject: [PATCH 24/45] [HIPIFY][cmake] Make CMakeLists use default 3.5.1 for Ubuntu 16.04 --- hipify-clang/CMakeLists.txt | 2 +- hipify-clang/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index bf4ef6872d..25076a8895 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5.2) +cmake_minimum_required(VERSION 3.5.1) project(hipify-clang) diff --git a/hipify-clang/README.md b/hipify-clang/README.md index f53a87f95c..da5abc19da 100644 --- a/hipify-clang/README.md +++ b/hipify-clang/README.md @@ -196,7 +196,7 @@ Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 Minimum build system requirements for the above configurations: -Python 2.7, cmake 3.5.2, GNU C/C++ 5.4.0. +Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -346,7 +346,7 @@ LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38 Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2: -Python 3.6.0 - 3.8.0, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). +Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: From 8f0a2266600e68a2e4473ad99d79e8e65ae0cef3 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Thu, 17 Oct 2019 18:58:32 -0400 Subject: [PATCH 25/45] _aligned_malloc() on Windows first takes size, then alignment, which is the opposite of how the similar function behaves on Linux. Memory allocated by it also has to be freed using _aligned_free(), unlike Linux where we can use regular free(). Edit aligned_alloc() macro and add a aligned_free() one to align with the above behaviour. --- tests/src/runtimeApi/memory/hipMemcpy_simple.cpp | 4 ++-- tests/src/test_common.h | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 8ce02d6164..4aacfa866d 100644 --- a/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) { HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); } else { - free(A_h1); - free(A_h2); + aligned_free(A_h1); + aligned_free(A_h2); } } diff --git a/tests/src/test_common.h b/tests/src/test_common.h index 73a952b0d1..dd77cd64a3 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -99,11 +99,15 @@ THE SOFTWARE. #ifdef _WIN64 #include -#define aligned_alloc _aligned_malloc +#define aligned_alloc(x,y) _aligned_malloc(y,x) +#define aligned_free(x) _aligned_free(x) #define popen(x,y) _popen(x,y) #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) #endif +#else +#define aligned_free(x) free(x) +#endif // standard command-line variables: extern size_t N; From 98874c0e7fa98f1df6f927c28378fb11a58d6f7c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 18 Oct 2019 18:51:40 +0300 Subject: [PATCH 26/45] [HIPIFY][CUB][#1460] Add "using namespace cub" translation support + Add cub_03.cu --- hipify-clang/src/HipifyAction.cpp | 19 ++++++++++ hipify-clang/src/HipifyAction.h | 1 + .../unit_tests/libraries/CUB/cub_03.cu | 37 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu diff --git a/hipify-clang/src/HipifyAction.cpp b/hipify-clang/src/HipifyAction.cpp index f63da1b2de..510d91978a 100644 --- a/hipify-clang/src/HipifyAction.cpp +++ b/hipify-clang/src/HipifyAction.cpp @@ -63,6 +63,7 @@ const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl"; +const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -472,6 +473,16 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul return false; } +bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubUsingNamespaceDecl)) { + if (auto nsd = decl->getNominatedNamespace()) { + FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP); + return true; + } + } + return false; +} + bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) { if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) { auto *Tparams = decl->getTemplateParameters(); @@ -611,6 +622,13 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ).bind(sCubFunctionTemplateDecl), this ); + // TODO: Maybe worth to make it more concrete + Finder->addMatcher( + mat::usingDirectiveDecl( + mat::isExpansionInMainFile() + ).bind(sCubUsingNamespaceDecl), + this + ); // Ownership is transferred to the caller. return Finder->newASTConsumer(); } @@ -725,4 +743,5 @@ void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaDeviceFuncCall(Result)) return; if (cubNamespacePrefix(Result)) return; if (cubFunctionTemplateDecl(Result)) return; + if (cubUsingNamespaceDecl(Result)) return; } diff --git a/hipify-clang/src/HipifyAction.h b/hipify-clang/src/HipifyAction.h index 73879bfd14..f70d17dd0b 100644 --- a/hipify-clang/src/HipifyAction.h +++ b/hipify-clang/src/HipifyAction.h @@ -76,6 +76,7 @@ public: bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result); + bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu new file mode 100644 index 0000000000..8f68bb40c6 --- /dev/null +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -0,0 +1,37 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +#include + +// using namespace hipcub; +using namespace cub; + +/** +* Simple CUDA kernel for computing tiled partial sums +*/ +template +__global__ void ScanTilesKernel(int *d_in, int *d_out) { + // Specialize collective types for problem context + // TODO: typedef cub::BlockLoad BlockLoadT; + typedef BlockLoad BlockLoadT; + typedef BlockScan BlockScanT; + // Allocate on-chip temporary storage + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockScanT::TempStorage reduce; + } temp_storage; + // Load data per thread + int thread_data[ITEMS_PER_THREAD]; + int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); + BlockLoadT(temp_storage.load).Load(d_in + offset, offset); + __syncthreads(); + // Compute the block-wide prefix sum + BlockScanT(temp_storage).Sum(thread_data); +} From 82adc93e69c0a6104d586ce471c21649306c3fc4 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 18 Oct 2019 18:55:52 +0300 Subject: [PATCH 27/45] [HIPIFY][tests] Test clean-up --- tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu index 8f68bb40c6..9fdbc17515 100644 --- a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -6,14 +6,10 @@ // CHECK: #include #include -#include - // using namespace hipcub; using namespace cub; -/** -* Simple CUDA kernel for computing tiled partial sums -*/ +// Simple CUDA kernel for computing tiled partial sums template From 664b115c44b98f23236b238985e3304ac4a1708b Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Fri, 18 Oct 2019 16:40:29 -0400 Subject: [PATCH 28/45] Remove extra #endif. --- tests/src/test_common.h | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/src/test_common.h b/tests/src/test_common.h index dd77cd64a3..67a8e5e60a 100644 --- a/tests/src/test_common.h +++ b/tests/src/test_common.h @@ -104,7 +104,6 @@ THE SOFTWARE. #define popen(x,y) _popen(x,y) #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) -#endif #else #define aligned_free(x) free(x) #endif From ccb075b1db8bf502a3c8c3e10d970b8bcce6d1fd Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sun, 20 Oct 2019 20:03:18 +0300 Subject: [PATCH 29/45] [HIPIFY][tests] Exclude all CUB tests if CUDA_CUB_ROOT_DIR is not set --- tests/hipify-clang/lit.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index 89e17249f0..0a40dda623 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -33,6 +33,8 @@ if not config.cuda_dnn_root: warns = True if not config.cuda_cub_root: config.excludes.append('cub_01.cu') + config.excludes.append('cub_02.cu') + config.excludes.append('cub_03.cu') print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR") warns = True if warns: From 6cfea9b600c0e87049ad441090dbe89e804ec38a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sun, 20 Oct 2019 20:08:56 +0300 Subject: [PATCH 30/45] [HIPIFY][tests] Set -I for CUDA path instead of --cuda-path for LLVM < 4 --- tests/hipify-clang/lit.cfg | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index 0a40dda623..c9c5b83bd3 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -124,7 +124,11 @@ elif config.cuda_cub_root: else: config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root)) -hipify_arguments = "--cuda-path='%s'" +if config.llvm_version_major < 4: + hipify_arguments = "-I'%s'/include" +else: + hipify_arguments = "--cuda-path='%s'" + config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root)) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) From 14b4df126c99aee0734bbe9b3c12ed0f343e5f68 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 15:51:25 +0300 Subject: [PATCH 31/45] [HIPIFY][tests] Set max clang's CudaArch for corresponding CUDA version [Reason] To support maximum CUDA features in offline tests + Add CUDA_VERSION >= 800 restriction for atomics.cu [TODO] Find a way to use or exclude atomicAdd for doubles if LLVM < 7, because LLVM 6.0.1 and older do not use --cuda-gpu-arch in clang's Driver code at all (option is only declared) --- tests/hipify-clang/lit.cfg | 11 +++++++++++ tests/hipify-clang/unit_tests/device/atomics.cu | 2 ++ 2 files changed, 13 insertions(+) diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index c9c5b83bd3..79be2b3d39 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -108,6 +108,17 @@ else: if config.pointer_size == 8: clang_arguments += " -D__LP64__" +# Set max clang's CudaArch for corresponding CUDA version +# to support maximum CUDA features in offline tests +if config.cuda_version_major == 7: + clang_arguments += " --cuda-gpu-arch=sm_52" +if config.cuda_version_major == 8: + clang_arguments += " --cuda-gpu-arch=sm_62" +if config.cuda_version_major == 9: + clang_arguments += " --cuda-gpu-arch=sm_70" +if config.cuda_version_major == 10: + clang_arguments += " --cuda-gpu-arch=sm_75" + # cuDNN ROOT if config.cuda_dnn_root: clang_arguments += " -I'%s'/include" diff --git a/tests/hipify-clang/unit_tests/device/atomics.cu b/tests/hipify-clang/unit_tests/device/atomics.cu index 1afd1ab541..e24b9a2175 100644 --- a/tests/hipify-clang/unit_tests/device/atomics.cu +++ b/tests/hipify-clang/unit_tests/device/atomics.cu @@ -276,7 +276,9 @@ int main(int argc, char** argv) { runTest(); runTest(); runTest(); +#if CUDA_VERSION >= 8000 runTest(); +#endif // CHECK: hipDeviceReset(); cudaDeviceReset(); printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); From b08f29a6fa5c052b7234f9d8c75c44df0924376c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 17:15:05 +0300 Subject: [PATCH 32/45] [HIPIFY][perl] Support of 'using namespace cub' --- bin/hipify-perl | 7 +++++++ hipify-clang/src/CUDA2HIP_Perl.cpp | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/bin/hipify-perl b/bin/hipify-perl index 2e391ab8d2..09c8d2a139 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -1676,6 +1676,12 @@ sub transformKernelLaunch { } } +sub transformCubNamespace { + my $k = 0; + $k += s/using\s*namespace\s*cub/using namespace hipcub/g; + return $k; +} + sub transformHostFunctions { my $k = 0; foreach $func ( @@ -2368,6 +2374,7 @@ while (@ARGV) { simpleSubstitutions(); transformExternShared(); transformKernelLaunch(); + transformCubNamespace(); if ($print_stats) { while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) { $convertedTags{$1}++; diff --git a/hipify-clang/src/CUDA2HIP_Perl.cpp b/hipify-clang/src/CUDA2HIP_Perl.cpp index df83bf8d57..8d59089d4f 100644 --- a/hipify-clang/src/CUDA2HIP_Perl.cpp +++ b/hipify-clang/src/CUDA2HIP_Perl.cpp @@ -251,6 +251,11 @@ namespace perl { *streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl; } + void generateCubNamespace(unique_ptr& streamPtr) { + *streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl; + *streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl; + } + void generateHostFunctions(unique_ptr& streamPtr) { *streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl; set &funcSet = DeviceSymbolFunctions0; @@ -358,6 +363,7 @@ namespace perl { generateSimpleSubstitutions(streamPtr); generateExternShared(streamPtr); generateKernelLaunch(streamPtr); + generateCubNamespace(streamPtr); generateHostFunctions(streamPtr); generateDeviceFunctions(streamPtr); *streamPtr.get() << endl << "# Count of transforms in all files" << endl; @@ -401,6 +407,7 @@ namespace perl { *streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl; *streamPtr.get() << tab_2 << "transformExternShared();" << endl; *streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl; + *streamPtr.get() << tab_2 << "transformCubNamespace();" << endl; *streamPtr.get() << tab_2 << "if ($print_stats) {" << endl; *streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl; *streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl; From 39e7d213cf212d65bc04d400e7455bbc4c7620b5 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 17:50:00 +0300 Subject: [PATCH 33/45] [HIPIFY][tests] Set max clang's CudaArch for corresponding CUDA major.minor version [Reason] To support maximum CUDA features in offline tests + Add defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 restriction for atomicAdd on doubles in atomics.cu. So if LLVM < 7 and --cuda-gpu-arch doesn't work, __CUDA_ARCH__ is unset too (350 by default in clang); if LLVM >= 7 --cuda-gpu-arch is used and __CUDA_ARCH__ is set based on it. --- tests/hipify-clang/lit.cfg | 16 +++++++++++----- tests/hipify-clang/unit_tests/device/atomics.cu | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/hipify-clang/lit.cfg b/tests/hipify-clang/lit.cfg index 79be2b3d39..ea496e6a69 100644 --- a/tests/hipify-clang/lit.cfg +++ b/tests/hipify-clang/lit.cfg @@ -111,12 +111,18 @@ if config.pointer_size == 8: # Set max clang's CudaArch for corresponding CUDA version # to support maximum CUDA features in offline tests if config.cuda_version_major == 7: - clang_arguments += " --cuda-gpu-arch=sm_52" -if config.cuda_version_major == 8: + if config.cuda_version_minor == 5: + clang_arguments += " --cuda-gpu-arch=sm_53" + else: + clang_arguments += " --cuda-gpu-arch=sm_52" +elif config.cuda_version_major == 8: clang_arguments += " --cuda-gpu-arch=sm_62" -if config.cuda_version_major == 9: - clang_arguments += " --cuda-gpu-arch=sm_70" -if config.cuda_version_major == 10: +elif config.cuda_version_major == 9: + if config.cuda_version_minor == 2: + clang_arguments += " --cuda-gpu-arch=sm_72" + else: + clang_arguments += " --cuda-gpu-arch=sm_70" +elif config.cuda_version_major == 10: clang_arguments += " --cuda-gpu-arch=sm_75" # cuDNN ROOT diff --git a/tests/hipify-clang/unit_tests/device/atomics.cu b/tests/hipify-clang/unit_tests/device/atomics.cu index e24b9a2175..3089efe1b8 100644 --- a/tests/hipify-clang/unit_tests/device/atomics.cu +++ b/tests/hipify-clang/unit_tests/device/atomics.cu @@ -276,7 +276,7 @@ int main(int argc, char** argv) { runTest(); runTest(); runTest(); -#if CUDA_VERSION >= 8000 +#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 runTest(); #endif // CHECK: hipDeviceReset(); From 6f88c81a78822ba903fad085d8c6d86653fb7e10 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 22 Oct 2019 11:08:37 +0300 Subject: [PATCH 34/45] [HIPIFY][#1569] Fix --- bin/hipify-perl | 1 - hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp | 2 -- tests/hipify-clang/unit_tests/samples/vec_add.cu | 1 - 3 files changed, 4 deletions(-) diff --git a/bin/hipify-perl b/bin/hipify-perl index 09c8d2a139..80f721c58e 100755 --- a/bin/hipify-perl +++ b/bin/hipify-perl @@ -983,7 +983,6 @@ sub simpleSubstitutions { $ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g; $ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g; $ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g; - $ft{'type'} += s/\bwarpSize\b/hipWarpSize/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g; diff --git a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp index fda9b34b2c..7b34d97ab4 100644 --- a/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp +++ b/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp @@ -31,8 +31,6 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { {"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}}, - // NOTE: int warpSize is a field of cudaDeviceProp - {"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, diff --git a/tests/hipify-clang/unit_tests/samples/vec_add.cu b/tests/hipify-clang/unit_tests/samples/vec_add.cu index bc8219bf8c..a6d8950e0b 100644 --- a/tests/hipify-clang/unit_tests/samples/vec_add.cu +++ b/tests/hipify-clang/unit_tests/samples/vec_add.cu @@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank) cudaError_t cudareturn; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, gpudevice); - // CHECK: if (deviceProp.hipWarpSize <= 1) if (deviceProp.warpSize <= 1) { printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank); From b6e6f12b546a0ebca0c35ae7efeafa0ca4524cdd Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 22 Oct 2019 19:07:37 +0300 Subject: [PATCH 35/45] [HIPIFY] Disable delayed template parsing By implicit unconditional passing -fno-delayed-template-parsing option (which appeared in LLVM 3.8.0, thus doesn't need compatibility wrapping) to hipify-clang. [Reason] To parse uncalled template functions otherwise they are not parsed without calling, thus not hipified. Affects cub_03.cu test, which has uncalled global template function. --- hipify-clang/src/main.cpp | 1 + .../hipify-clang/unit_tests/libraries/CUB/cub_03.cu | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/hipify-clang/src/main.cpp b/hipify-clang/src/main.cpp index 2214567df3..64037c43dd 100644 --- a/hipify-clang/src/main.cpp +++ b/hipify-clang/src/main.cpp @@ -199,6 +199,7 @@ int main(int argc, const char **argv) { Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN)); + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN)); if (!CudaPath.empty()) { std::string sCudaPath = "--cuda-path=" + CudaPath; Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN)); diff --git a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu index 9fdbc17515..bc914d419d 100644 --- a/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu +++ b/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -1,8 +1,6 @@ // RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args // CHECK: #include #include -// CHECK: #include -#include // CHECK: #include #include @@ -11,12 +9,14 @@ using namespace cub; // Simple CUDA kernel for computing tiled partial sums template + // CHECK: hipcub::BlockLoadAlgorithm LOAD_ALGO, + cub::BlockLoadAlgorithm LOAD_ALGO, + // CHECK: hipcub::BlockScanAlgorithm SCAN_ALGO> + cub::BlockScanAlgorithm SCAN_ALGO> __global__ void ScanTilesKernel(int *d_in, int *d_out) { // Specialize collective types for problem context - // TODO: typedef cub::BlockLoad BlockLoadT; - typedef BlockLoad BlockLoadT; + // CHECK: typedef ::hipcub::BlockLoad BlockLoadT; + typedef ::cub::BlockLoad BlockLoadT; typedef BlockScan BlockScanT; // Allocate on-chip temporary storage __shared__ union { From 75d70a6714d491c0d465cf9651ac0ec78f5ef4ff Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 23 Oct 2019 18:54:45 +0300 Subject: [PATCH 36/45] [HIPIFY][cmake][#1571] Take into account building hipify-clang as a part of building HIP while installing [Algorithm] [Release] If CMAKE_INSTALL_PREFIX is set by the user: If BIN_INSTALL_DIR is set by HIP, use it as CMAKE_INSTALL_PREFIX, otherwise CMAKE_INSTALL_PREFIX is used unchanged. If the user does not set CMAKE_INSTALL_PREFIX (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT): If BIN_INSTALL_DIR is set by HIP, use it as CMAKE_INSTALL_PREFIX, otherwise use PROJECT_BINARY_DIR/bin for installation. [Debug] If CMAKE_INSTALL_PREFIX is set by the user: CMAKE_INSTALL_PREFIX is used unchanged. If the user does not set CMAKE_INSTALL_PREFIX (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT): use CMAKE_CURRENT_SOURCE_DIR/bin for installation. Standalone build left unchanged: CMAKE_INSTALL_PREFIX is used if set. --- hipify-clang/CMakeLists.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/hipify-clang/CMakeLists.txt b/hipify-clang/CMakeLists.txt index 25076a8895..2f24c6c6fe 100644 --- a/hipify-clang/CMakeLists.txt +++ b/hipify-clang/CMakeLists.txt @@ -80,11 +80,28 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"") +set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang") +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + elseif(CMAKE_BUILD_TYPE MATCHES Release) + if (BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + else() + set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + endif() + else() + message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release") + endif() +elseif(BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) +endif() + install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX}) install( DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ - DESTINATION ${CMAKE_INSTALL_PREFIX} + DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT clang-resource-headers FILES_MATCHING PATTERN "*.h" From 359dc79101bf2fa10852750fba179aec7847ba85 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 24 Oct 2019 08:13:30 -0400 Subject: [PATCH 37/45] [hip] add support for implicit kernel argument for multi-grid sync (#1456) * [hip] add support for implicit kernel argument for multi-grid sync * modified code for calculating the prev_sum * change the impCoopArg type to size_t * add memory clean up * launch init_gws and main kernels into two separate loops --- src/hip_hcc_internal.h | 21 +++++ src/hip_memory.cpp | 194 +++++++++++++++++++++-------------------- src/hip_module.cpp | 95 ++++++++++++++++++-- 3 files changed, 211 insertions(+), 99 deletions(-) diff --git a/src/hip_hcc_internal.h b/src/hip_hcc_internal.h index 3ee14577b0..658cfbf576 100644 --- a/src/hip_hcc_internal.h +++ b/src/hip_hcc_internal.h @@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { namespace hip_internal { hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); + +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags); + +hipError_t ihipHostFree(TlsData *tls, void* ptr); + +}; + +#define MAX_COOPERATIVE_GPUs 255 + +// do not change these two structs without changing the device library +struct mg_sync { + uint w0; + uint w1; +}; + +struct mg_info { + struct mg_sync *mgs; + uint grid_id; + uint num_grids; + ulong prev_sum; + ulong all_sum; }; //--- diff --git a/src/hip_memory.cpp b/src/hip_memory.cpp index 48b83287f3..c8369685ec 100644 --- a/src/hip_memory.cpp +++ b/src/hip_memory.cpp @@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s return ptr; } +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { + hipError_t hip_status = hipSuccess; + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + + auto ctx = ihipGetTlsDefaultCtx(); + if ((ctx == nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; + } + else if (sizeBytes == 0) { + hip_status = hipSuccess; + // TODO - should size of 0 return err or be siliently ignored? + } else { + unsigned trueFlags = flags; + if (flags == hipHostMallocDefault) { + // HCC/ROCM provide a modern system with unified memory and should set both of these + // flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; + } + + + const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | + hipHostMallocWriteCombined | hipHostMallocCoherent | + hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent + hip_status = hipErrorInvalidValue; + } else { + auto device = ctx->getWriteableDevice(); +#if (__hcc_workweek__ >= 19115) + //Avoid mapping host pinned memory to all devices by HCC + unsigned amFlags = amHostUnmapped; +#else + unsigned amFlags = 0; +#endif + if (flags & hipHostMallocCoherent) { + amFlags |= amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags |= amHostNonCoherent; + } else { + // depends on env variables: + amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; + } + + + *ptr = hip_internal::allocAndSharePtr( + (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, + true /*shareWithAll*/, amFlags, flags, 0); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } + } + } + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + return hip_status; +} + +hipError_t ihipHostFree(TlsData *tls, void* ptr) { + + // Synchronize to ensure all work has finished. + ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits + // for all activity to finish. + + hipError_t hipStatus = hipErrorInvalidValue; + if (ptr) { + hc::accelerator acc; +#if (__hcc_workweek__ >= 17332) + hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); +#else + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); +#endif + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if (status == AM_SUCCESS) { + if (amPointerInfo._hostPointer == ptr) { + hc::am_free(ptr); + hipStatus = hipSuccess; + } + } + } else { + // free NULL pointer succeeds and is common technique to initialize runtime + hipStatus = hipSuccess; + } + + return hipStatus; +} + } // end namespace hip_internal @@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag return ihipLogStatus(hip_status); } -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { - hipError_t hip_status = hipSuccess; - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - hip_status = hipSuccess; - // TODO - should size of 0 return err or be siliently ignored? - } else { - unsigned trueFlags = flags; - if (flags == hipHostMallocDefault) { - // HCC/ROCM provide a modern system with unified memory and should set both of these - // flags by default: - trueFlags = hipHostMallocMapped | hipHostMallocPortable; - } - - - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | - hipHostMallocWriteCombined | hipHostMallocCoherent | - hipHostMallocNonCoherent; - - - const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; - - if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { - *ptr = nullptr; - // can't specify unsupported flags, can't specify both Coherent + NonCoherent - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); -#if (__hcc_workweek__ >= 19115) - //Avoid mapping host pinned memory to all devices by HCC - unsigned amFlags = amHostUnmapped; -#else - unsigned amFlags = 0; -#endif - if (flags & hipHostMallocCoherent) { - amFlags |= amHostCoherent; - } else if (flags & hipHostMallocNonCoherent) { - amFlags |= amHostNonCoherent; - } else { - // depends on env variables: - amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; - } - - - *ptr = hip_internal::allocAndSharePtr( - (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, - true /*shareWithAll*/, amFlags, flags, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } - } - } - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - return hip_status; -} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; - hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags); + hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); return ihipLogStatus(hip_status); } @@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { if(flags != hipMemAttachGlobal) hip_status = hipErrorInvalidValue; else - hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); + hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); return ihipLogStatus(hip_status); } @@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) { hipError_t hipHostFree(void* ptr) { HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr); - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - - hipError_t hipStatus = hipErrorInvalidValue; - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == ptr) { - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } + hipError_t hipStatus = hipSuccess; + hipStatus = hip_internal::ihipHostFree(tls, ptr); return ihipLogStatus(hipStatus); }; diff --git a/src/hip_module.cpp b/src/hip_module.cpp index b3afdd4ffe..ac239105b8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -137,7 +137,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, + void** impCoopParams = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -181,10 +182,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global return hipErrorInvalidValue; } - // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero. + // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero. size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1); kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0); + if (impCoopParams) { + const auto p{static_cast(*impCoopParams)}; + // The sixth index is for multi-grid synchronization + kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, + p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); + } + /* Kernel argument preparation. */ @@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, return ihipLogStatus(hipErrorLaunchFailure); } + size_t impCoopArg = 1; + void* impCoopParams[1]; + impCoopParams[0] = &impCoopArg; + // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, gridDim.x * blockDimX.x, @@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true); + nullptr, 0, true, impCoopParams); stream->criticalData().unlock(); #if (__hcc_workweek__ >= 19213) @@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags); hipError_t result; - if (numDevices > g_deviceCnt || launchParamsList == nullptr) { + if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return ihipLogStatus(hipErrorInvalidValue); } @@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi kargs.getHandle()); } + mg_sync *mg_sync_ptr = 0; + mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); + if (result != hipSuccess) { + return ihipLogStatus(hipErrorInvalidValue); + } + mg_sync_ptr->w0 = 0; + mg_sync_ptr->w1 = 0; + + uint all_sum = 0; + for (int i = 0; i < numDevices; ++i) { + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + if (result != hipSuccess) { + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < i; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorInvalidValue); + } + // calculate the sum of sizes of all grids + const hipLaunchParams& lp = launchParamsList[i]; + all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + } + // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); @@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi #endif } - // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device + // launch the init_gws kernel to initialize the GWS for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); #endif } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorLaunchFailure); } + } + + void* impCoopParams[1]; + ulong prev_sum = 0; + // launch the main kernels for each device + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + + //initialize and setup the implicit kernel argument for multi-grid sync + mg_info_ptr[i]->mgs = mg_sync_ptr; + mg_info_ptr[i]->grid_id = i; + mg_info_ptr[i]->num_grids = numDevices; + mg_info_ptr[i]->all_sum = all_sum; + mg_info_ptr[i]->prev_sum = prev_sum; + prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + + + impCoopParams[0] = &mg_info_ptr[i]; result = ihipModuleLaunchKernel(tls, kds[i], lp.gridDim.x * lp.blockDim.x, @@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true); + true, impCoopParams); + + if (result != hipSuccess) { + for (int j = 0; j < numDevices; ++j) { + launchParamsList[j].stream->criticalData().unlock(); +#if (__hcc_workweek__ >= 19213) + launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); +#endif + } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + + return ihipLogStatus(hipErrorLaunchFailure); + } + } // unlock all streams @@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi free(gwsKds); free(kds); + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(result); } From 4a635add45383161b9d1d0e99f7bfe0727be18d3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 24 Oct 2019 13:13:49 +0100 Subject: [PATCH 38/45] Improve scalar access into vector types. (#1531) The improvement is based on the ideas here: https://t0rakka.silvrback.com/simd-scalar-accessor. It yields significantly better ISA when the base's .xyzw members are used. --- include/hip/hcc_detail/hip_vector_types.h | 109 +++++++++++++++++++--- 1 file changed, 96 insertions(+), 13 deletions(-) diff --git a/include/hip/hcc_detail/hip_vector_types.h b/include/hip/hcc_detail/hip_vector_types.h index a03a46b8cf..f80745038a 100644 --- a/include/hip/hcc_detail/hip_vector_types.h +++ b/include/hip/hcc_detail/hip_vector_types.h @@ -47,6 +47,95 @@ THE SOFTWARE. #if defined(__cplusplus) #include + namespace hip_impl { + template + struct Scalar_accessor { + // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor + Vector data; + + __host__ __device__ + operator T() const noexcept { return data[idx]; } + + __host__ __device__ + Scalar_accessor& operator=(T x) noexcept { + data[idx] = x; + + return *this; + } + + __host__ __device__ + Scalar_accessor& operator+=(T x) noexcept { + data[idx] += x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator-=(T x) noexcept { + data[idx] -= x; + return *this; + } + + __host__ __device__ + Scalar_accessor& operator*=(T x) noexcept { + data[idx] *= x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator/=(T x) noexcept { + data[idx] /= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator%=(T x) noexcept { + data[idx] %= x; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator>>=(T x) noexcept { + data[idx] >>= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator<<=(T x) noexcept { + data[idx] <<= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator&=(T x) noexcept { + data[idx] &= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator|=(T x) noexcept { + data[idx] |= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator^=(T x) noexcept { + data[idx] ^= x; + return *this; + } + }; + } // Namespace hip_impl. + template struct HIP_vector_base; template @@ -55,9 +144,7 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - }; + hip_impl::Scalar_accessor x; }; }; @@ -67,10 +154,8 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; }; }; @@ -238,12 +323,10 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - T z; - T w; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; + hip_impl::Scalar_accessor z; + hip_impl::Scalar_accessor w; }; }; From c4a51f3679d532c8a4b3b65c36a461edd23d867d Mon Sep 17 00:00:00 2001 From: searlmc1 Date: Thu, 24 Oct 2019 05:14:05 -0700 Subject: [PATCH 39/45] Improve performance of v2 arg handling (#1539) * Improve performance of v2 arg handling * Missing change to `std::string` --- include/hip/hcc_detail/code_object_bundle.hpp | 2 +- src/hip_module.cpp | 33 ++- src/program_state.inl | 192 +++++++++++++----- 3 files changed, 150 insertions(+), 77 deletions(-) diff --git a/include/hip/hcc_detail/code_object_bundle.hpp b/include/hip/hcc_detail/code_object_bundle.hpp index 32b0c0dbc8..f312d2e79b 100644 --- a/include/hip/hcc_detail/code_object_bundle.hpp +++ b/include/hip/hcc_detail/code_object_bundle.hpp @@ -86,7 +86,7 @@ struct Bundled_code { char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; } header; std::string triple; - std::vector blob; + std::string blob; }; #define magic_string_ "__CLANG_OFFLOAD_BUNDLE__" diff --git a/src/hip_module.cpp b/src/hip_module.cpp index ac239105b8..2afbabf0a8 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -109,6 +109,7 @@ struct ihipModuleSymbol_t { amd_kernel_code_t const* _header{}; string _name; // TODO - review for performance cost. Name is just used for debug. vector> _kernarg_layout{}; + bool _is_code_object_v3{}; }; template <> @@ -216,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto* header = reinterpret_cast(f->_header); aql.group_segment_size = @@ -1060,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_ // available per CU, therefore we hardcode it to 64 KiRegisters. prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (kd._is_code_object_v3) { r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; - } else { - r.localSizeBytes = kd._header->workitem_private_segment_byte_size; - r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; - } - r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - if (is_code_object_v3) { r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - } else { - r.numRegs = kd._header->workitem_vgpr_count; - } - r.maxThreadsPerBlock = r.numRegs ? - std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : - prop.maxThreadsPerBlock; - if (is_code_object_v3) { r.binaryVersion = 0; // FIXME: should it be the ISA version or code // object format version? } else { + r.localSizeBytes = kd._header->workitem_private_segment_byte_size; + r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; + r.numRegs = kd._header->workitem_vgpr_count; r.binaryVersion = kd._header->amd_machine_version_major * 10 + kd._header->amd_machine_version_minor; } + r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; + r.maxThreadsPerBlock = r.numRegs ? + std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : + prop.maxThreadsPerBlock; r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. return r; @@ -1182,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima content.data(), content.size(), (*module)->executable, this_agent()); - std::vector blob(content.cbegin(), content.cend()); - program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs); + program_state_impl::read_kernarg_metadata(content, (*module)->kernargs); // compute the hash of the code object (*module)->hash = checksum(content.length(), content.data()); @@ -1235,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) { - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto header = reinterpret_cast(f->_header); // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) diff --git a/src/program_state.inl b/src/program_state.inl index 4f05d2763e..9feabbc2f7 100644 --- a/src/program_state.inl +++ b/src/program_state.inl @@ -89,9 +89,10 @@ struct Symbol { class Kernel_descriptor { std::uint64_t kernel_object_{}; - amd_kernel_code_t const* kernel_header_{nullptr}; - std::string name_{}; + amd_kernel_code_t const* header_{}; + std::string name_; std::vector> kernarg_layout_{}; + bool is_code_object_v3_{}; public: Kernel_descriptor() = default; Kernel_descriptor( @@ -101,7 +102,8 @@ public: : kernel_object_{kernel_object}, name_{name}, - kernarg_layout_{std::move(kernarg_layout)} + kernarg_layout_{std::move(kernarg_layout)}, + is_code_object_v3_{name.find(".kd") != std::string::npos} { bool supported{false}; std::uint16_t min_v{UINT16_MAX}; @@ -123,7 +125,7 @@ public: r = tbl.hsa_ven_amd_loader_query_host_address( reinterpret_cast(kernel_object_), - reinterpret_cast(&kernel_header_)); + reinterpret_cast(&header_)); if (r != HSA_STATUS_SUCCESS) return; } @@ -149,7 +151,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>> code_object_blobs; + std::vector>>> code_object_blobs; std::pair< std::once_flag, @@ -213,7 +215,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>& get_code_object_blobs() { + std::vector>>& get_code_object_blobs() { std::call_once(code_object_blobs.first, [this]() { dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { @@ -584,6 +586,68 @@ public: return functions[agent].second; } + static + std::size_t parse_args_v2( + const std::string& metadata, + std::size_t f, + std::size_t l, + std::vector>& size_align) { + if (f == l) return f; + if (!size_align.empty()) return l; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = std::strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = std::strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); + } + + static + void read_kernarg_metadata_v2( + const std::string& kernels_md, + std::size_t dx, + std::unordered_map< + std::string, + std::vector>>& kernargs) { + do { + dx = kernels_md.find("Name:", dx); + + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) name_sz{5}; + dx = kernels_md.find_first_not_of(" '", dx + name_sz); + + auto fn = + kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx); + dx += fn.size(); + + auto dx1 = kernels_md.find("CodeProps", dx); + dx = kernels_md.find("Args:", dx); + + if (dx1 < dx) { + dx = dx1; + continue; + } + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) args_sz{5}; + dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]); + } while (true); + } + static std::string metadata_to_string(const amd_comgr_metadata_node_t& md) { std::string str; @@ -598,9 +662,8 @@ public: } static - void parse_args( + void parse_args_v3( const amd_comgr_metadata_node_t& args_md, - bool is_code_object_v3, std::vector>& size_align) { size_t arg_count = 0; if (amd_comgr_get_metadata_list_size(args_md, &arg_count) @@ -615,9 +678,7 @@ public: return; amd_comgr_metadata_node_t arg_size_md; - if (amd_comgr_metadata_lookup(arg_md, - is_code_object_v3 ? ".size" : "Size", - &arg_size_md) + if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md) != AMD_COMGR_STATUS_SUCCESS) return; @@ -629,35 +690,21 @@ public: size_t arg_align; - if (is_code_object_v3) { - amd_comgr_metadata_node_t arg_offset_md; - if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + amd_comgr_metadata_node_t arg_offset_md; + if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - size_t arg_offset - = std::stoul(metadata_to_string(arg_offset_md)); + size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md)); - if (amd_comgr_destroy_metadata(arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + if (amd_comgr_destroy_metadata(arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - arg_align = 1; - while (arg_offset && (arg_offset & 1) == 0) { - arg_offset >>= 1; - arg_align <<= 1; - } - } else { - amd_comgr_metadata_node_t arg_align_md; - if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - arg_align = std::stoul(metadata_to_string(arg_align_md)); - - if (amd_comgr_destroy_metadata(arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + arg_align = 1; + while (arg_offset && (arg_offset & 1) == 0) { + arg_offset >>= 1; + arg_align <<= 1; } size_align.emplace_back(arg_size, arg_align); @@ -669,11 +716,11 @@ public: } static - void read_kernarg_metadata( - const std::vector& blob, + void read_kernarg_metadata_v3( + const std::string& blob, std::unordered_map< - std::string, - std::vector>>& kernargs) { + std::string, + std::vector>>& kernargs) { amd_comgr_data_t dataIn; amd_comgr_status_t status; @@ -690,7 +737,6 @@ public: != AMD_COMGR_STATUS_SUCCESS) return; - bool is_code_object_v3 = false; amd_comgr_metadata_node_t kernels_md; if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md) != AMD_COMGR_STATUS_SUCCESS) { @@ -699,7 +745,6 @@ public: &kernels_md) != AMD_COMGR_STATUS_SUCCESS) return; - is_code_object_v3 = true; } size_t kernel_count = 0; @@ -715,9 +760,7 @@ public: continue; amd_comgr_metadata_node_t name_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".name" : "Name", - &name_md) + if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md) != AMD_COMGR_STATUS_SUCCESS) continue; @@ -727,21 +770,15 @@ public: != AMD_COMGR_STATUS_SUCCESS) continue; - if (is_code_object_v3) - kernel_name_str.append(".kd"); - - amd_comgr_metadata_node_t args_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".args" : "Args", - &args_md) + if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md) != AMD_COMGR_STATUS_SUCCESS) continue; auto foundKernel = kernargs.find(kernel_name_str); // parse arguments for a given kernel only once if (foundKernel == kernargs.end()) { - parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]); + parse_args_v3(args_md, kernargs[kernel_name_str]); } if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS @@ -757,7 +794,52 @@ public: amd_comgr_release_data(dataIn); } - const std::unordered_map>>& kernargs) + { + std::istringstream istr{blob}; + ELFIO::elfio reader; + + if (!reader.load(istr)) return; + + // TODO: this is inefficient. + auto it = find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_NOTE; + }); + + if (!it) return; + + const ELFIO::note_section_accessor acc{reader, it}; + auto n{acc.get_notes_num()}; + while (n--) { + ELFIO::Elf_Word type{}; + std::string name{}; + void* desc{}; + ELFIO::Elf_Word desc_size{}; + + acc.get_note(n, type, name, desc, desc_size); + + if (name == "AMDGPU") { + return read_kernarg_metadata_v3(blob, kernargs); + } + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + std::string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == std::string::npos) continue; + + return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:". + } + } + + const std::unordered_map>>& get_kernargs() { std::call_once(kernargs.first, [this]() { From af351d7e1b40a041551e205a39f9c20a68e03399 Mon Sep 17 00:00:00 2001 From: satyanveshd <53337087+satyanveshd@users.noreply.github.com> Date: Thu, 24 Oct 2019 17:44:47 +0530 Subject: [PATCH 40/45] Fix occupany APIs (#1560) Addresses SWDEV-205006 --- .../hip/hcc_detail/functional_grid_launch.hpp | 29 ------------ include/hip/hcc_detail/hip_runtime_api.h | 36 +++++++++++---- samples/2_Cookbook/13_occupancy/occupancy.cpp | 8 ++-- src/hip_module.cpp | 45 ++++++++++--------- .../module/hipLaunchCooperativeKernel.cpp | 2 +- ...upancyMaxActiveBlocksPerMultiprocessor.cpp | 21 ++------- .../hipOccupancyMaxPotentialBlockSize.cpp | 20 +-------- 7 files changed, 63 insertions(+), 98 deletions(-) diff --git a/include/hip/hcc_detail/functional_grid_launch.hpp b/include/hip/hcc_detail/functional_grid_launch.hpp index c493eec933..da57240ebd 100644 --- a/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/include/hip/hcc_detail/functional_grid_launch.hpp @@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl( } // Namespace hip_impl. -template -inline -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, - dynSharedMemPerBlk, blockSizeLimit); -} - -template -inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); -} - template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/include/hip/hcc_detail/hip_runtime_api.h b/include/hip/hcc_detail/hip_runtime_api.h index b4402fd67a..3a81305ba3 100644 --- a/include/hip/hcc_detail/hip_runtime_api.h +++ b/include/hip/hcc_detail/hip_runtime_api.h @@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi * @param [out] gridSize minimum grid size for maximum potential occupancy * @param [out] blockSize block size for maximum potential occupancy * @param [in] f kernel function for which occupancy is calulated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit * * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue */ -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit); +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit); /** * @brief Returns occupancy for a device function. @@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize); /** * @brief Returns occupancy for a device function. @@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags); /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched @@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize); +} +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags); +} + +template +inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + T f, size_t dynamicSMemSize, int blockSizeLimit) { + return hipOccupancyMaxPotentialBlockSize( + gridSize, blockSize, reinterpret_cast(f), dynamicSMemSize, blockSizeLimit); +} + template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { diff --git a/samples/2_Cookbook/13_occupancy/occupancy.cpp b/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..01fa7aafed 100644 --- a/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){ const unsigned threadsperblock = 32; const unsigned blocks = (NUM/threadsperblock)+1; - uint32_t mingridSize = 0; - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int mingridSize = 0; + int gridSize = 0; + int blockSize = 0; if (manual){ blockSize = threadsperblock; @@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; + int numBlock = 0; HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ diff --git a/src/hip_module.cpp b/src/hip_module.cpp index 2afbabf0a8..c15cc34cd5 100644 --- a/src/hip_module.cpp +++ b/src/hip_module.cpp @@ -1248,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size } } -hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize, + hipFunction_t f, size_t dynamicSMemSize, + int blockSizeLimit) { using namespace hip_impl; @@ -1331,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, } else { size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor; - size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk); + size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize); wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG; } @@ -1360,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, return hipSuccess; } -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit) { - HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); - + HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); + gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit)); } hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize) { using namespace hip_impl; @@ -1411,35 +1412,39 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + size_t total_used_lds = usedLDS + dynamicSMemSize; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipLaunchKernel( diff --git a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index 89d003ea94..8089f26f16 100644 --- a/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -116,7 +116,7 @@ int main() { dimBlock.x = workgroups[i]; // Calculate the device occupancy to know how many blocks can be run concurrently - hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..2838c09cd1 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; } int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; + int numBlock = 0; hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); assert(numBlock != 0); - // test case for using kernel function pointer with template gridSize = 0; blockSize = 0; @@ -64,15 +59,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); assert(numBlock != 0); - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); - assert(numBlock != 0); - passed(); } diff --git a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..22a3f05283 100644 --- a/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp @@ -30,22 +30,16 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template __global__ void f2(T *a) { *a = 1; } - - int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); @@ -55,15 +49,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0); assert(gridSize != 0 && blockSize != 0); - // test case for using kernel with hipFunction_t type - gridSize = 0; - blockSize = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0)); - assert(gridSize != 0 && blockSize != 0); - passed(); } From 9ba25b42c895ea10bfb4e7ce07e42f6ad2e01cf6 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 24 Oct 2019 13:15:20 +0100 Subject: [PATCH 41/45] Make CAS loops use the TTAS idiom. (#1573) * Make CAS loops use the TTAS idiom. * More efficient re-formulation of TTAS. * Fix typo. * The typo was not quite a typo --- include/hip/hcc_detail/hip_atomic.h | 42 ++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/include/hip/hcc_detail/hip_atomic.h b/include/hip/hcc_detail/hip_atomic.h index a5ac94a74b..2c13411319 100644 --- a/include/hip/hcc_detail/hip_atomic.h +++ b/include/hip/hcc_detail/hip_atomic.h @@ -59,12 +59,17 @@ float atomicAdd(float* address, float val) { unsigned int* uaddr{reinterpret_cast(address)}; unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned int r; + unsigned int r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __uint_as_float(r); } @@ -74,13 +79,18 @@ double atomicAdd(double* address, double val) { unsigned long long* uaddr{reinterpret_cast(address)}; unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned long long r; + unsigned long long r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS( uaddr, r, __double_as_longlong(val + __longlong_as_double(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __longlong_as_double(r); } @@ -144,7 +154,13 @@ unsigned long long atomicMin( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (val < tmp) { tmp = atomicCAS(address, tmp, val); } + while (val < tmp) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -167,7 +183,13 @@ unsigned long long atomicMax( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (tmp < val) { tmp = atomicCAS(address, tmp, val); } + while (tmp < val) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -177,7 +199,7 @@ inline unsigned int atomicInc(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_inc( unsigned int*, unsigned int, @@ -194,7 +216,7 @@ inline unsigned int atomicDec(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_dec( unsigned int*, unsigned int, From 81952ce5a77ced752144bb7521b4cae87a26a7b0 Mon Sep 17 00:00:00 2001 From: gandryey <56892148+gandryey@users.noreply.github.com> Date: Thu, 24 Oct 2019 08:15:42 -0400 Subject: [PATCH 42/45] Hip vdi profiling header (#1577) Add HIP-VDI profiling interface for GPU timing collection. --- include/hip/hcc_detail/hip_runtime_prof.h | 77 +++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 include/hip/hcc_detail/hip_runtime_prof.h diff --git a/include/hip/hcc_detail/hip_runtime_prof.h b/include/hip/hcc_detail/hip_runtime_prof.h new file mode 100644 index 0000000000..4d4eccb54d --- /dev/null +++ b/include/hip/hcc_detail/hip_runtime_prof.h @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + +// HIP VDI Op IDs enumeration +enum HipVdiOpId { + kHipVdiOpIdDispatch = 0, + kHipVdiOpIdCopy = 1, + kHipVdiOpIdBarrier = 2, + kHipVdiOpIdNumber = 3 +}; + +// Types of VDI commands +enum HipVdiCommandKind { + kHipVdiCommandKernel = 0x11F0, + kHipVdiMemcpyDeviceToHost = 0x11F3, + kHipHipVdiMemcpyHostToDevice = 0x11F4, + kHipVdiMemcpyDeviceToDevice = 0x11F5, + kHipVidMemcpyDeviceToHostRect = 0x1201, + kHipVdiMemcpyHostToDeviceRect = 0x1202, + kHipVdiMemcpyDeviceToDeviceRect = 0x1203, + kHipVdiFillMemory = 0x1207, +}; + +/** + * @brief Initializes activity callback + * + * @param [input] id_callback Event ID callback function + * @param [input] op_callback Event operation callback function + * @param [input] arg Arguments passed into callback + * + * @returns None + */ +void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg); + +/** + * @brief Enables activity callback + * + * @param [input] op Operation, which will trigger a callback (@see HipVdiOpId) + * @param [input] enable Enable state for the callback + * + * @returns True if successful + */ +bool hipEnableActivityCallback(uint32_t op, bool enable); + +/** + * @brief Returns the description string for the operation kind + * + * @param [input] id Command kind id (@see HipVdiCommandKind) + * + * @returns A pointer to a const string with the command description + */ +const char* hipGetCmdName(uint32_t id); + +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + From 04e10814d876346de637fa4c622cb7e62153ce90 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 24 Oct 2019 05:15:51 -0700 Subject: [PATCH 43/45] Add HIP checks in texture driver sample (#1581) --- .../11_texture_driver/texture2dDrv.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp index b3c1ef5d0c..3be6d12b31 100755 --- a/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) { desc.NumChannels = 1; desc.Width = width; desc.Height = height; - hipArrayCreate(&array, &desc); + HIP_CHECK(hipArrayCreate(&array, &desc)); hip_Memcpy2D copyParam; memset(©Param, 0, sizeof(copyParam)); @@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) { copyParam.srcPitch = width * sizeof(float); copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = height; - hipMemcpyParam2D(©Param); + HIP_CHECK(hipMemcpyParam2D(©Param)); textureReference* texref; - hipModuleGetTexRef(&texref, Module, "tex"); - hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap); - hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap); - hipTexRefSetFilterMode(texref, hipFilterModePoint); - hipTexRefSetFlags(texref, 0); - hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1); - hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT); + HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex")); + HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint)); + HIP_CHECK(hipTexRefSetFlags(texref, 0)); + HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1)); + HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT)); float* dData = NULL; - hipMalloc((void**)&dData, size); + HIP_CHECK(hipMalloc((void**)&dData, size)); struct { void* _Ad; @@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) { float* hOutputData = (float*)malloc(size); memset(hOutputData, 0, size); - hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); + HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost)); for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { @@ -124,8 +124,8 @@ bool runTest(int argc, char** argv) { } } } - hipFree(dData); - hipFreeArray(array); + HIP_CHECK(hipFree(dData)); + HIP_CHECK(hipFreeArray(array)); return testResult; } From 770d3412f8387ca166e7153eb1b6d2236239fbdb Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Thu, 24 Oct 2019 17:46:06 +0530 Subject: [PATCH 44/45] Adding New Analyze Target Merging with cppcheck (#1583) --- CMakeLists.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 28b8683b22..57369a9039 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,7 +504,7 @@ endif() find_program(CPPCHECK_EXE cppcheck) if(CPPCHECK_EXE) add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() ############################# @@ -545,4 +545,17 @@ else() message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location") endif() +############################# +# Code analysis +############################# +# Target: clang +if(HIP_HIPCC_EXECUTABLE) + add_custom_target(analyze + COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./ + WORKING_DIRECTORY ${HIP_SRC_PATH}) + if(CPPCHECK_EXE) + add_dependencies(analyze cppcheck) + endif() +endif() + # vim: ts=4:sw=4:expandtab:smartindent From 70f2cd13172bc1afe1caef0837807409f02f80e7 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 24 Oct 2019 05:21:55 -0700 Subject: [PATCH 45/45] Update profiling doc (#1576) --- docs/markdown/hip_profiling.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/markdown/hip_profiling.md b/docs/markdown/hip_profiling.md index 8a44368680..28ed37e321 100644 --- a/docs/markdown/hip_profiling.md +++ b/docs/markdown/hip_profiling.md @@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording #### How to enable profiling at HIP build time -Recent pre-built packages of HIP are always built with profiling support enabled. -For developer builds, you must enable marker support manually when compiling HIP. +Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP. 1. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default.