From 4b51111a1daad203996519bd63377ca3bbe7ecb9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 14 Oct 2019 16:32:57 +0300 Subject: [PATCH 01/45] [HIPIFY][build fix] Add missing new source file [ROCm/clr commit: 01ae988faba639927fe55d94c71d18c3a1ae50e6] --- .../src/CUDA2HIP_CUB_API_types.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 projects/clr/hipamd/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp diff --git a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp new file mode 100644 index 0000000000..0ef1912b54 --- /dev/null +++ b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_CUB_API_types.cpp @@ -0,0 +1,28 @@ +/* +Copyright (c) 2015 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "CUDA2HIP.h" + +// Maps the names of CUDA CUB API types to the corresponding HIP types +const std::map CUDA_CUB_TYPE_NAME_MAP{ + {"cub", {"hipcub", "", CONV_TYPE, API_CUB}}, +}; From 15d27444504151d86e1a79051d8c15878cbbaf14 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 14 Oct 2019 18:08:29 +0300 Subject: [PATCH 02/45] [HIPIFY] HipifyAction code clean-up [ROCm/clr commit: 0025c24fe95c7497a8ecbd8dca4c5d312af2932b] --- .../hipamd/hipify-clang/src/HipifyAction.cpp | 301 ++++++++---------- .../hipamd/hipify-clang/src/HipifyAction.h | 28 +- 2 files changed, 141 insertions(+), 188 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp index c6d10f0cdc..930f3ec8c5 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp @@ -33,16 +33,20 @@ THE SOFTWARE. #include "StringUtils.h" #include "ArgParse.h" -namespace ct = clang::tooling; -namespace mat = clang::ast_matchers; - +const std::string sHIP = "HIP"; +const std::string sROC = "ROC"; +const std::string sCub = "cub"; const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED"; +const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string sHIP_SYMBOL = "HIP_SYMBOL"; -std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string s_reinterpret_cast = "reinterpret_cast"; -const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL("; +const std::string sHipLaunchKernelGGL = "hipLaunchKernelGGL"; const std::string sDim3 = "dim3("; - +const std::string s_hiprand_kernel_h = "hiprand_kernel.h"; +const std::string s_hiprand_h = "hiprand.h"; +const std::string sOnce = "once"; +const std::string s_string_literal = "[string literal]"; +// CUDA identifiers, used in matchers const std::string sCudaMemcpyToSymbol = "cudaMemcpyToSymbol"; const std::string sCudaMemcpyToSymbolAsync = "cudaMemcpyToSymbolAsync"; const std::string sCudaGetSymbolSize = "cudaGetSymbolSize"; @@ -51,6 +55,12 @@ const std::string sCudaMemcpyFromSymbol = "cudaMemcpyFromSymbol"; const std::string sCudaMemcpyFromSymbolAsync = "cudaMemcpyFromSymbolAsync"; const std::string sCudaFuncSetCacheConfig = "cudaFuncSetCacheConfig"; const std::string sCudaFuncGetAttributes = "cudaFuncGetAttributes"; +// Matchers' names +const StringRef sCudaSharedIncompleteArrayVar = "cudaSharedIncompleteArrayVar"; +const StringRef sCudaLaunchKernel = "cudaLaunchKernel"; +const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; +const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; +const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -78,7 +88,7 @@ std::set ReinterpretFunctions1{ }; void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); size_t begin = 0; while ((begin = s.find("cu", begin)) != StringRef::npos) { const size_t end = s.find_first_of(" ", begin + 4); @@ -86,7 +96,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { const auto found = CUDA_RENAMES_MAP().find(name); if (found != CUDA_RENAMES_MAP().end()) { StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - hipCounter counter = {"[string literal]", "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; + hipCounter counter = {s_string_literal, "", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.supportDegree}; Statistics::current().incrementCounter(counter, name.str()); if (!Statistics::isUnsupported(counter)) { clang::SourceLocation sl = start.getLocWithOffset(begin + 1); @@ -95,9 +105,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { insertReplacement(Rep, fullSL); } } - if (end == StringRef::npos) { - break; - } + if (end == StringRef::npos) break; begin = end + 1; } } @@ -109,7 +117,7 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { * If it's an unsupported CUDA identifier, a warning is emitted. * Otherwise, the source file is updated with the corresponding hipification. */ -void HipifyAction::RewriteToken(const clang::Token& t) { +void HipifyAction::RewriteToken(const clang::Token &t) { // String literals containing CUDA references need fixing. if (t.is(clang::tok::string_literal)) { StringRef s(t.getLiteralData(), t.getLength()); @@ -124,20 +132,21 @@ void HipifyAction::RewriteToken(const clang::Token& t) { FindAndReplace(name, sl, CUDA_RENAMES_MAP()); } -void HipifyAction::FindAndReplace(llvm::StringRef name, +void HipifyAction::FindAndReplace(StringRef name, clang::SourceLocation sl, - const std::map& repMap, bool bReplace) { + const std::map &repMap, + bool bReplace) { const auto found = repMap.find(name); if (found == repMap.end()) { // So it's an identifier, but not CUDA? Boring. return; } Statistics::current().incrementCounter(found->second, name.str()); - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); // Warn the user about unsupported identifier. if (Statistics::isUnsupported(found->second)) { std::string sWarn; - Statistics::isToRoc(found->second) ? sWarn = "ROC" : sWarn = "HIP"; + Statistics::isToRoc(found->second) ? sWarn = sROC : sWarn = sHIP; sWarn = "" + sWarn; const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier is unsupported in %0."); DE.Report(sl, ID) << sWarn; @@ -147,7 +156,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, return; } StringRef repName = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); ct::Replacement Rep(SM, sl, name.size(), repName.str()); clang::FullSourceLoc fullSL(sl, SM); insertReplacement(Rep, fullSL); @@ -155,7 +164,7 @@ void HipifyAction::FindAndReplace(llvm::StringRef name, namespace { -clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getReadRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{}); @@ -167,7 +176,7 @@ clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRan } } -clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +clang::SourceRange getWriteRange(clang::SourceManager &SM, const clang::SourceRange &exprRange) { clang::SourceLocation begin = exprRange.getBegin(); clang::SourceLocation end = exprRange.getEnd(); // If the range is contained within a macro, update the macro definition. @@ -178,7 +187,7 @@ clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRa return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; } -StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) { +StringRef readSourceText(clang::SourceManager &SM, const clang::SourceRange &exprRange) { return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr); } @@ -186,53 +195,50 @@ StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exp * Get a string representation of the expression `arg`, unless it's a defaulting function * call argument, in which case get a 0. Used for building argument lists to kernel calls. */ -std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) { - if (clang::isa(arg)) { - return "0"; - } else { - return readSourceText(SM, arg->getSourceRange()); - } +std::string stringifyZeroDefaultedArg(clang::SourceManager &SM, const clang::Expr *arg) { + if (clang::isa(arg)) return "0"; + else return readSourceText(SM, arg->getSourceRange()); } } // anonymous namespace -bool HipifyAction::Exclude(const hipCounter & hipToken) { +bool HipifyAction::Exclude(const hipCounter &hipToken) { switch (hipToken.type) { case CONV_INCLUDE_CUDA_MAIN_H: switch (hipToken.apiType) { case API_DRIVER: case API_RUNTIME: - if (insertedRuntimeHeader) { return true; } + if (insertedRuntimeHeader) return true; insertedRuntimeHeader = true; return false; case API_BLAS: - if (insertedBLASHeader) { return true; } + if (insertedBLASHeader) return true; insertedBLASHeader = true; return false; case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; return false; - } else if (hipToken.hipName == "hiprand.h") { - if (insertedRANDHeader) { return true; } + } else if (hipToken.hipName == s_hiprand_h) { + if (insertedRANDHeader) return true; insertedRANDHeader = true; return false; } case API_DNN: - if (insertedDNNHeader) { return true; } + if (insertedDNNHeader) return true; insertedDNNHeader = true; return false; case API_FFT: - if (insertedFFTHeader) { return true; } + if (insertedFFTHeader) return true; insertedFFTHeader = true; return false; case API_COMPLEX: - if (insertedComplexHeader) { return true; } + if (insertedComplexHeader) return true; insertedComplexHeader = true; return false; case API_SPARSE: - if (insertedSPARSEHeader) { return true; } + if (insertedSPARSEHeader) return true; insertedSPARSEHeader = true; return false; default: @@ -240,13 +246,11 @@ bool HipifyAction::Exclude(const hipCounter & hipToken) { } return false; case CONV_INCLUDE: - if (hipToken.hipName.empty()) { - return true; - } + if (hipToken.hipName.empty()) return true; switch (hipToken.apiType) { case API_RAND: - if (hipToken.hipName == "hiprand_kernel.h") { - if (insertedRAND_kernelHeader) { return true; } + if (hipToken.hipName == s_hiprand_kernel_h) { + if (insertedRAND_kernelHeader) return true; insertedRAND_kernelHeader = true; } return false; @@ -267,24 +271,19 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, clang::CharSourceRange filename_range, const clang::FileEntry*, StringRef, StringRef, const clang::Module*) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(hash_loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(hash_loc)) return; if (!firstHeader) { firstHeader = true; firstHeaderLoc = hash_loc; } const auto found = CUDA_INCLUDE_MAP.find(file_name); - if (found == CUDA_INCLUDE_MAP.end()) { - return; - } + if (found == CUDA_INCLUDE_MAP.end()) return; bool exclude = Exclude(found->second); Statistics::current().incrementCounter(found->second, file_name.str()); - clang::SourceLocation sl = filename_range.getBegin(); if (Statistics::isUnsupported(found->second)) { - clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + clang::DiagnosticsEngine &DE = getCompilerInstance().getDiagnostics(); DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header.")); return; } @@ -293,11 +292,8 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, if (!exclude) { clang::SmallString<128> includeBuffer; llvm::StringRef name = Statistics::isToRoc(found->second) ? found->second.rocName : found->second.hipName; - if (is_angled) { - newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); - } else { - newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); - } + if (is_angled) newInclude = llvm::Twine("<" + name+ ">").toStringRef(includeBuffer); + else newInclude = llvm::Twine("\"" + name + "\"").toStringRef(includeBuffer); } else { // hashLoc is location of the '#', thus replacing the whole include directive by empty newInclude starting with '#'. sl = hash_loc; @@ -309,46 +305,33 @@ void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, } void HipifyAction::PragmaDirective(clang::SourceLocation Loc, clang::PragmaIntroducerKind Introducer) { - if (pragmaOnce) { - return; - } - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); + if (pragmaOnce) return; + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); clang::Token tok; PP.Lex(tok); StringRef Text(SM.getCharacterData(tok.getLocation()), tok.getLength()); - if (Text == "once") { + if (Text == sOnce) { pragmaOnce = true; pragmaOnceLoc = tok.getEndLoc(); } } -bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaLaunchKernel"; - const auto* launchKernel = Result.Nodes.getNodeAs(refName); - if (!launchKernel) { - return false; - } - const clang::Expr* calleeExpr = launchKernel->getCallee(); - if (!calleeExpr) { - return false; - } - const clang::FunctionDecl *caleeDecl = launchKernel->getDirectCallee(); - if (!caleeDecl) { - return false; - } - const clang::CallExpr* config = launchKernel->getConfig(); - if (!config) { - return false; - } +bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) { + auto *launchKernel = Result.Nodes.getNodeAs(sCudaLaunchKernel); + if (!launchKernel) return false; + auto *calleeExpr = launchKernel->getCallee(); + if (!calleeExpr) return false; + auto *caleeDecl = launchKernel->getDirectCallee(); + if (!caleeDecl) return false; + auto *config = launchKernel->getConfig(); + if (!config) return false; clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::LangOptions DefaultLangOptions; - clang::SourceManager* SM = Result.SourceManager; - OS << sHipLaunchKernelGGL; + auto *SM = Result.SourceManager; + OS << sHipLaunchKernelGGL << "("; if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "("; OS << readSourceText(*SM, calleeExpr->getSourceRange()); if (caleeDecl->isTemplateInstantiation()) OS << ")"; @@ -381,29 +364,24 @@ bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::Matc ct::Replacement Rep(*SM, launchStart, length, OS.str()); clang::FullSourceLoc fullSL(launchStart, *SM); insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernelGGL", "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + hipCounter counter = {sHipLaunchKernelGGL, "", ConvTypes::CONV_KERNEL_LAUNCH, ApiTypes::API_RUNTIME}; + Statistics::current().incrementCounter(counter, sCudaLaunchKernel.str()); return true; } -bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - StringRef refName = "cudaSharedIncompleteArrayVar"; - auto* sharedVar = Result.Nodes.getNodeAs(refName); - if (!sharedVar) { - return false; - } +bool HipifyAction::cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result) { + auto *sharedVar = Result.Nodes.getNodeAs(sCudaSharedIncompleteArrayVar); + if (!sharedVar) return false; // Example: extern __shared__ uint sRadix1[]; - if (!sharedVar->hasExternalFormalLinkage()) { - return false; - } + if (!sharedVar->hasExternalFormalLinkage()) return false; clang::QualType QT = sharedVar->getType(); std::string typeName; if (QT->isIncompleteArrayType()) { - const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); + const clang::ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); QT = AT->getElementType(); if (QT.getTypePtr()->isBuiltinType()) { QT = QT.getCanonicalType(); - const auto* BT = clang::dyn_cast(QT); + auto *BT = clang::dyn_cast(QT); if (BT) { clang::LangOptions LO; LO.CUDA = true; @@ -417,7 +395,7 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match if (!typeName.empty()) { clang::SourceLocation slStart = sharedVar->getOuterLocStart(); clang::SourceLocation slEnd = llcompat::getEndLoc(sharedVar->getTypeSourceInfo()->getTypeLoc()); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; std::string varName = sharedVar->getNameAsString(); std::string repName = sHIP_DYNAMIC_SHARED + "(" + typeName + ", " + varName + ")"; @@ -425,52 +403,39 @@ bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::Match clang::FullSourceLoc fullSL(slStart, *SM); insertReplacement(Rep, fullSL); hipCounter counter = {sHIP_DYNAMIC_SHARED, "", ConvTypes::CONV_EXTERN_SHARED, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); + Statistics::current().incrementCounter(counter, sCudaSharedIncompleteArrayVar.str()); return true; } return false; } -bool HipifyAction::cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr *call = Result.Nodes.getNodeAs("cudaDeviceFuncCall")) { - const clang::FunctionDecl *funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; - } +bool HipifyAction::cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (const clang::CallExpr *call = Result.Nodes.getNodeAs(sCudaDeviceFuncCall)) { + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; FindAndReplace(funcDcl->getDeclName().getAsString(), llcompat::getBeginLoc(call), CUDA_DEVICE_FUNC_MAP, false); return true; } return false; } -bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::TypedefNameDecl *decl = Result.Nodes.getNodeAs("cubNamespacePrefix")) { - if (!decl) { - return false; - } +bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubNamespacePrefix)) { clang::QualType QT = decl->getUnderlyingType(); - const clang::Type* t = QT.getTypePtr(); - if (!t) { - return false; - } - const clang::ElaboratedType* et = t->getAs(); - if (!et) { - return false; - } + auto *t = QT.getTypePtr(); + if (!t) return false; + const clang::ElaboratedType *et = t->getAs(); + if (!et) return false; const clang::NestedNameSpecifier *nns = et->getQualifier(); - if (!nns) { - return false; - } + if (!nns) return false; const clang::NamespaceDecl *nsd = nns->getAsNamespace(); - if (!nsd) { - return false; - } + if (!nsd) return false; const clang::TypeSourceInfo *si = decl->getTypeSourceInfo(); const clang::TypeLoc tloc = si->getTypeLoc(); const clang::SourceRange sr = tloc.getSourceRange(); clang::SourceLocation sl(sr.getBegin()); clang::SourceLocation end(sr.getEnd()); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + auto &SM = getCompilerInstance().getSourceManager(); size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); @@ -484,15 +449,11 @@ bool HipifyAction::cubNamespacePrefix(const clang::ast_matchers::MatchFinder::Ma return false; } -bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result) { - if (const clang::CallExpr * call = Result.Nodes.getNodeAs("cudaHostFuncCall")) { - if (!call->getNumArgs()) { - return false; - } - const clang::FunctionDecl* funcDcl = call->getDirectCallee(); - if (!funcDcl) { - return false; - } +bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) { + if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) { + if (!call->getNumArgs()) return false; + auto *funcDcl = call->getDirectCallee(); + if (!funcDcl) return false; std::string sName = funcDcl->getDeclName().getAsString(); unsigned int argNum = 0; bool b_reinterpret = (ReinterpretFunctions.find(sName) != ReinterpretFunctions.end()) ? true : false; @@ -506,7 +467,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc clang::SmallString<40> XStr; llvm::raw_svector_ostream OS(XStr); clang::SourceRange sr = call->getArg(argNum)->getSourceRange(); - clang::SourceManager* SM = Result.SourceManager; + auto *SM = Result.SourceManager; OS << (b_reinterpret ? s_reinterpret_cast : sHIP_SYMBOL) << "(" << readSourceText(*SM, sr) << ")"; clang::SourceRange replacementRange = getWriteRange(*SM, { sr.getBegin(), sr.getEnd() }); clang::SourceLocation s = replacementRange.getBegin(); @@ -521,7 +482,7 @@ bool HipifyAction::cudaHostFuncCall(const clang::ast_matchers::MatchFinder::Matc return false; } -void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) { +void HipifyAction::insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL) { llcompat::insertReplacement(*replacements, rep); if (PrintStats) { rep.getLength(); @@ -530,10 +491,10 @@ void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::Fu } } -std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) { - Finder.reset(new clang::ast_matchers::MatchFinder); +std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance &CI, StringRef) { + Finder.reset(new mat::MatchFinder); // Replace the <<<...>>> language extension with a hip kernel launch - Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this); + Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind(sCudaLaunchKernel), this); Finder->addMatcher( mat::varDecl( mat::isExpansionInMainFile(), @@ -541,7 +502,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::hasAttr(clang::attr::CUDAShared), mat::hasType(mat::incompleteArrayType()) ) - ).bind("cudaSharedIncompleteArrayVar"), + ).bind(sCudaSharedIncompleteArrayVar), this ); Finder->addMatcher( @@ -561,7 +522,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ) ) ) - ).bind("cudaHostFuncCall"), + ).bind(sCudaHostFuncCall), this ); Finder->addMatcher( @@ -576,7 +537,7 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::unless(mat::hasAttr(clang::attr::CUDAHost)) ) ) - ).bind("cudaDeviceFuncCall"), + ).bind(sCudaDeviceFuncCall), this ); Finder->addMatcher( @@ -586,12 +547,12 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi mat::elaboratedType( mat::hasQualifier( mat::specifiesNamespace( - mat::hasName("cub") + mat::hasName(sCub) ) ) ) ) - ).bind("cubNamespacePrefix"), + ).bind(sCubNamespacePrefix), this ); // Ownership is transferred to the caller. @@ -599,10 +560,8 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi } void HipifyAction::Ifndef(clang::SourceLocation Loc, const clang::Token &MacroNameTok, const clang::MacroDefinition &MD) { - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - if (!SM.isWrittenInMainFile(Loc)) { - return; - } + auto &SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(Loc)) return; StringRef Text(SM.getCharacterData(MacroNameTok.getLocation()), MacroNameTok.getLength()); Ifndefs.insert(std::make_pair(Text.str(), MacroNameTok.getEndLoc())); } @@ -615,12 +574,12 @@ void HipifyAction::EndSourceFileAction() { // one copy of the hip include into every file. bool placeForIncludeCalculated = false; clang::SourceLocation sl, controllingMacroLoc; - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::HeaderSearch& HS = PP.getHeaderSearchInfo(); - clang::ExternalPreprocessorSource* EPL = HS.getExternalLookup(); - const clang::FileEntry* FE = SM.getFileEntryForID(SM.getMainFileID()); - const clang::IdentifierInfo* controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); + auto &SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + clang::HeaderSearch &HS = PP.getHeaderSearchInfo(); + clang::ExternalPreprocessorSource *EPL = HS.getExternalLookup(); + const clang::FileEntry *FE = SM.getFileEntryForID(SM.getMainFileID()); + const clang::IdentifierInfo *controllingMacro = HS.getFileInfo(FE).getControllingMacro(EPL); if (controllingMacro) { auto found = Ifndefs.find(controllingMacro->getName().str()); if (found != Ifndefs.end()) { @@ -629,19 +588,13 @@ void HipifyAction::EndSourceFileAction() { } } if (pragmaOnce) { - if (placeForIncludeCalculated) { - sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; - } else { - sl = pragmaOnceLoc; - } + if (placeForIncludeCalculated) sl = pragmaOnceLoc < controllingMacroLoc ? pragmaOnceLoc : controllingMacroLoc; + else sl = pragmaOnceLoc; placeForIncludeCalculated = true; } if (!placeForIncludeCalculated) { - if (firstHeader) { - sl = firstHeaderLoc; - } else { - sl = SM.getLocForStartOfFile(SM.getMainFileID()); - } + if (firstHeader) sl = firstHeaderLoc; + else sl = SM.getLocForStartOfFile(SM.getMainFileID()); } clang::FullSourceLoc fullSL(sl, SM); ct::Replacement Rep(SM, sl, 0, "\n#include \n"); @@ -656,15 +609,15 @@ namespace { * A silly little class to proxy PPCallbacks back to the HipifyAction class. */ class PPCallbackProxy : public clang::PPCallbacks { - HipifyAction& hipifyAction; + HipifyAction &hipifyAction; public: - explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {} + explicit PPCallbackProxy(HipifyAction &action): hipifyAction(action) {} - void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token, + void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, StringRef file_name, bool is_angled, clang::CharSourceRange filename_range, - const clang::FileEntry* file, StringRef search_path, StringRef relative_path, - const clang::Module* imported + const clang::FileEntry *file, StringRef search_path, StringRef relative_path, + const clang::Module *imported #if LLVM_VERSION_MAJOR > 6 , clang::SrcMgr::CharacteristicKind FileType #endif @@ -688,10 +641,10 @@ bool HipifyAction::BeginInvocation(clang::CompilerInstance &CI) { } void HipifyAction::ExecuteAction() { - clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); - clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + clang::Preprocessor &PP = getCompilerInstance().getPreprocessor(); + auto &SM = getCompilerInstance().getSourceManager(); // Start lexing the specified input file. - const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID()); + const llvm::MemoryBuffer *FromFile = SM.getBuffer(SM.getMainFileID()); clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts()); RawLex.SetKeepWhitespaceMode(true); // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough @@ -709,7 +662,7 @@ void HipifyAction::ExecuteAction() { clang::ASTFrontendAction::ExecuteAction(); } -void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) { +void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaLaunchKernel(Result)) return; if (cudaSharedIncompleteArrayVar(Result)) return; if (cudaHostFuncCall(Result)) return; diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h index a24404deee..78532449ee 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h @@ -31,17 +31,18 @@ THE SOFTWARE. #include "Statistics.h" namespace ct = clang::tooling; +namespace mat = clang::ast_matchers; using namespace llvm; /** * A FrontendAction that hipifies CUDA programs. */ class HipifyAction : public clang::ASTFrontendAction, - public clang::ast_matchers::MatchFinder::MatchCallback { + public mat::MatchFinder::MatchCallback { private: - ct::Replacements* replacements; + ct::Replacements *replacements; std::map Ifndefs; - std::unique_ptr Finder; + std::unique_ptr Finder; // CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did // not, we insert it at the top of the file when we finish processing it. @@ -67,12 +68,11 @@ public: explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(), replacements(replacements) {} // MatchCallback listeners - bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaDeviceFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cudaHostFuncCall(const clang::ast_matchers::MatchFinder::MatchResult& Result); - bool cubNamespacePrefix(const clang::ast_matchers::MatchFinder::MatchResult& Result); + bool cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result); + bool cudaSharedIncompleteArrayVar(const mat::MatchFinder::MatchResult &Result); + bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); + bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, @@ -91,7 +91,7 @@ public: protected: // Add a Replacement for the current file. These will all be applied after executing the FrontendAction. - void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL); + void insertReplacement(const ct::Replacement &rep, const clang::FullSourceLoc &fullSL); // FrontendAction entry point. void ExecuteAction() override; // Callback before starting processing a single input; used by hipify-clang for setting Preprocessor options. @@ -99,8 +99,8 @@ protected: // Called at the start of each new file to process. void EndSourceFileAction() override; // MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an interest for. - void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override; - std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override; - bool Exclude(const hipCounter & hipToken); - void FindAndReplace(llvm::StringRef name, clang::SourceLocation sl, const std::map& repMap, bool bReplace = true); + void run(const mat::MatchFinder::MatchResult &Result) override; + std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, StringRef InFile) override; + bool Exclude(const hipCounter &hipToken); + void FindAndReplace(StringRef name, clang::SourceLocation sl, const std::map &repMap, bool bReplace = true); }; From d40dfe354a2a4083717e638fb20fbe7c61024fe6 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 15 Oct 2019 11:26:03 +0300 Subject: [PATCH 03/45] [HIPIFY][cmake] Make CMakeLists.txt compatible with default cmake 3.5.2 for Ubuntu 16.04 + Update README.md accordingly [ROCm/clr commit: 5dae577d674ac2c214c858e7ad05ab8fac418852] --- projects/clr/hipamd/hipify-clang/CMakeLists.txt | 10 +++------- projects/clr/hipamd/hipify-clang/README.md | 6 +++--- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/CMakeLists.txt b/projects/clr/hipamd/hipify-clang/CMakeLists.txt index 5eb71b3a3e..fd172a9c8d 100644 --- a/projects/clr/hipamd/hipify-clang/CMakeLists.txt +++ b/projects/clr/hipamd/hipify-clang/CMakeLists.txt @@ -1,8 +1,4 @@ -if (CUDA_VERSION VERSION_GREATER "9.2") - cmake_minimum_required(VERSION 3.12.3) -else() - cmake_minimum_required(VERSION 3.7.2) -endif() +cmake_minimum_required(VERSION 3.5.2) project(hipify-clang) @@ -58,7 +54,7 @@ target_link_libraries(hipify-clang PRIVATE LLVMOption LLVMCore) -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "7") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "6.0.1") target_link_libraries(hipify-clang PRIVATE clangToolingInclusions) endif() @@ -72,7 +68,7 @@ else() set(StdCpp "-std=c++") endif() -if (LLVM_PACKAGE_VERSION VERSION_GREATER_EQUAL "10.0") +if (LLVM_PACKAGE_VERSION VERSION_GREATER "9.0") string(APPEND StdCpp "14") # MSVC starting from 1900 (VS 2015) supports only the following c++ std values: c++14|c++17|c++latest elseif (MSVC) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index 07466dbe62..bb9654d8b4 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -63,7 +63,7 @@ In most cases, you can get a suitable version of LLVM+CLANG with your package manager. Failing that or having multiple versions of LLVM, you can [download a release archive](http://releases.llvm.org/), build or install it, and set -[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.12/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` +[CMAKE_PREFIX_PATH](https://cmake.org/cmake/help/v3.5/variable/CMAKE_PREFIX_PATH.html) so `cmake` can find it; for instance: `-DCMAKE_PREFIX_PATH=f:\LLVM\9.0.0\dist` ## Build and install @@ -196,7 +196,7 @@ Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 Build system for the above configurations: -Python 2.7 (min), cmake 3.12.3 (min), GNU C/C++ 5.4.0 (min). +Python 2.7 (min), cmake 3.5.2 (min), GNU C/C++ 5.4.0 (min). Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -345,7 +345,7 @@ LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38 Build system for the above configurations: -Python 3.6 - 3.7.4, cmake 3.12.3 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). +Python 3.6 - 3.7.4, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: From 4b0e9e9f05de65807f25d6c2bc44c57eb0a4db4f Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 15 Oct 2019 14:20:23 +0300 Subject: [PATCH 04/45] [HIPIFY][tests] Exclude tests for the libs, which are not defined in cmake command line + affects cuDNN and CUB tests, paths to libraries of which are defined by CUDA_DNN_ROOT_DIR and CUDA_CUB_ROOT_DIR + Warn about excluding and why, for instance: "WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR" [ROCm/clr commit: c0f7d02cedf303d76856d692aada1003e2c7eba2] --- .../clr/hipamd/tests/hipify-clang/lit.cfg | 46 ++++++++++++++----- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index 1d092a4327..89e17249f0 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -12,18 +12,31 @@ import lit.util site_cfg = lit_config.params.get('site_config', None) lit_config.load_config(config, site_cfg) -print(str("========================================")) +config.excludes = ['cmdparser.hpp'] +config.excludes.append('spatial_batch_norm_op.h') +config.excludes.append('common_cudnn.h') + +delimiter = "==============================================================="; +print(delimiter) print("CUDA " + config.cuda_version + " - will be used for testing") print("LLVM " + config.llvm_version + " - will be used for testing") print(platform.machine() + " - Platform architecture") print(platform.system() + " " + platform.release() + " - Platform OS") print(str(config.pointer_size * 8) + " - hipify-clang binary bitness") print(str(struct.calcsize("P") * 8) + " - python " + str(platform.python_version()) + " binary bitness") -print(str("========================================")) - -config.excludes = ['cmdparser.hpp'] -config.excludes.append('spatial_batch_norm_op.h') -config.excludes.append('common_cudnn.h') +print(delimiter) +warns = None +if not config.cuda_dnn_root: + config.excludes.append('cudnn_convolution_forward.cu') + config.excludes.append('cudnn_softmax.cu') + print("WARN: cuDNN tests are excluded due to unset CUDA_DNN_ROOT_DIR") + warns = True +if not config.cuda_cub_root: + config.excludes.append('cub_01.cu') + print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR") + warns = True +if warns: + print(delimiter) if config.cuda_version_major == 7 and config.cuda_version_minor == 0: config.excludes.append('headers_test_09.cu') @@ -90,17 +103,26 @@ else: run_test_ext = ".sh" # CUDA SDK ROOT clang_arguments += " -isystem'%s'/samples/common/inc" -# cuDNN ROOT -clang_arguments += " -I'%s'/include" if config.pointer_size == 8: clang_arguments += " -D__LP64__" +# cuDNN ROOT +if config.cuda_dnn_root: + clang_arguments += " -I'%s'/include" # CUB ROOT -clang_arguments += " -I'%s'" +if config.cuda_cub_root: + clang_arguments += " -I'%s'" + +if config.cuda_dnn_root and config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) +elif config.cuda_dnn_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root))) +elif config.cuda_cub_root: + config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_cub_root))) +else: + config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root)) hipify_arguments = "--cuda-path='%s'" - -config.substitutions.append(("%clang_args", clang_arguments % (config.cuda_sdk_root, config.cuda_dnn_root, config.cuda_cub_root))) -config.substitutions.append(("%hipify_args", hipify_arguments % (config.cuda_root))) +config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root)) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) From e2c2025e3e7dd45543812b60b72e07679d947024 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:18:42 -0400 Subject: [PATCH 05/45] Update the declarations of hipMemsetD8, hipMemsetD8Async, hipMemsetD16, hipMemsetD16Async. These functions are type aware and take in as their third argument the number of elements in the buffer, not the buffer size. Change the name of this argument from sizeBytes to count to align with the above description. [ROCm/clr commit: 0200aa3a210cc52d5238936b182a9d80d87ed152] --- .../include/hip/hcc_detail/hip_runtime_api.h | 16 ++++++------- projects/clr/hipamd/src/hip_memory.cpp | 24 +++++++++---------- 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index c07d2ad9f1..b4402fd67a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -1742,10 +1742,10 @@ hipError_t hipMemset(void* dst, int value, size_t sizeBytes); * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1758,11 +1758,11 @@ hipError_t hipMemsetD8(hipDeviceptr_t dest, unsigned char value, size_t sizeByte * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1770,10 +1770,10 @@ hipError_t hipMemsetD8Async(hipDeviceptr_t dest, unsigned char value, size_t siz * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t count); /** * @brief Fills the first sizeBytes bytes of the memory area pointed to by dest with the constant @@ -1786,11 +1786,11 @@ hipError_t hipMemsetD16(hipDeviceptr_t dest, unsigned short value, size_t sizeBy * * @param[out] dst Data ptr to be filled * @param[in] constant value to be set - * @param[in] sizeBytes Data size in bytes + * @param[in] number of values to be set * @param[in] stream - Stream identifier * @return #hipSuccess, #hipErrorInvalidValue, #hipErrorNotInitialized */ -hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t sizeBytes, hipStream_t stream __dparm(0)); +hipError_t hipMemsetD16Async(hipDeviceptr_t dest, unsigned short value, size_t count, hipStream_t stream __dparm(0)); /** * @brief Fills the memory area pointed to by dest with the constant integer diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index ed1422fcda..48b83287f3 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -1935,15 +1935,15 @@ hipError_t hipMemset2DAsync(void* dst, size_t pitch, int value, size_t width, si return ihipLogStatus(e); }; -hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes) { - HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t count) { + HIP_INIT_SPECIAL_API(hipMemsetD8, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = hipStreamNull; stream = ihipSyncAndResolveStream(stream); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar); stream->locked_wait(); } else { e = hipErrorInvalidValue; @@ -1951,23 +1951,23 @@ hipError_t hipMemsetD8(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes return ihipLogStatus(e); } -hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t sizeBytes , hipStream_t stream ) { - HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD8Async(hipDeviceptr_t dst, unsigned char value, size_t count , hipStream_t stream ) { + HIP_INIT_SPECIAL_API(hipMemsetD8Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeChar)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeChar)); } else { return ihipLogStatus(hipErrorInvalidValue); } } -hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes){ - HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, sizeBytes); +hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t count){ + HIP_INIT_SPECIAL_API(hipMemsetD16, (TRACE_MCMD), dst, value, count); hipError_t e = hipSuccess; hipStream_t stream = ihipSyncAndResolveStream(hipStreamNull); if (stream) { - e = ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort); + e = ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort); if(hipSuccess == e) stream->locked_wait(); } else { @@ -1976,12 +1976,12 @@ hipError_t hipMemsetD16(hipDeviceptr_t dst, unsigned short value, size_t sizeByt return ihipLogStatus(e); } -hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t sizeBytes, hipStream_t stream ){ - HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, sizeBytes, stream); +hipError_t hipMemsetD16Async(hipDeviceptr_t dst, unsigned short value, size_t count, hipStream_t stream ){ + HIP_INIT_SPECIAL_API(hipMemsetD16Async, (TRACE_MCMD), dst, value, count, stream); stream = ihipSyncAndResolveStream(stream); if (stream) { - return ihipLogStatus(ihipMemset(dst, value, sizeBytes, stream, ihipMemsetDataTypeShort)); + return ihipLogStatus(ihipMemset(dst, value, count, stream, ihipMemsetDataTypeShort)); } else { return ihipLogStatus(hipErrorInvalidValue); } From e6f426dee3e46bbdb9406f242bdd81ad6c4e4bfb Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:20:14 -0400 Subject: [PATCH 06/45] hipMemset2D test should pass only if both async and sync subtests pass. [ROCm/clr commit: 39e42d4056a96068a28f40db261f5a33d8afd8a2] --- .../clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp index b3bcf42222..73f3f5d415 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -113,8 +113,8 @@ int main(int argc, char *argv[]) hipCtxCreate(&context, 0, p_gpuDevice); bool testResult = false; - testResult = testhipMemset2D(memsetval, p_gpuDevice); - testResult = testhipMemset2DAsync(memsetval, p_gpuDevice); + testResult &= testhipMemset2D(memsetval, p_gpuDevice); + testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice); hipCtxDestroy(context); if(testResult){ passed(); From d20c5251b19758eebca96e9093caf1da9f5b9034 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Tue, 15 Oct 2019 14:24:04 -0400 Subject: [PATCH 07/45] Add async subtest to hipMemSet3D [ROCm/clr commit: c83b6adb3382a8c9c0a37b791705ef3f50b1304c] --- .../src/runtimeApi/memory/hipMemset3D.cpp | 55 ++++++++++++++++++- 1 file changed, 53 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp index 11bd656761..ac26280314 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -79,12 +79,63 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice) return testResult; } +bool testhipMemset3DAsync(int memsetval,int p_gpuDevice) +{ + size_t numH = 256; + size_t numW = 256; + size_t depth = 10; + size_t width = numW * sizeof(char); + size_t sizeElements = width * numH * depth; + size_t elements = numW* numH* depth; + + + printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); + char *A_h; + bool testResult = true; + hipExtent extent = make_hipExtent(width, numH, depth); + hipPitchedPtr devPitchedPtr; + + HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); + A_h = (char*)malloc(sizeElements); + HIPASSERT(A_h != NULL); + for (size_t i=0; i Date: Tue, 15 Oct 2019 15:29:14 -0400 Subject: [PATCH 08/45] Update indentation in the hipMemset3D test. Replace all tabs with four spaces. [ROCm/clr commit: f402b6d01a8fcd35799f15906b73b32cac73f836] --- .../src/runtimeApi/memory/hipMemset3D.cpp | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp index ac26280314..ce2459a438 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -41,27 +41,27 @@ bool testhipMemset3D(int memsetval,int p_gpuDevice) size_t elements = numW* numH* depth; - printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); + printf ("testhipMemset3D memsetval=%2x device=%d\n", memsetval, p_gpuDevice); char *A_h; bool testResult = true; hipExtent extent = make_hipExtent(width, numH, depth); hipPitchedPtr devPitchedPtr; HIPCHECK(hipMalloc3D(&devPitchedPtr, extent)); - A_h = (char*)malloc(sizeElements); - HIPASSERT(A_h != NULL); - for (size_t i=0; i Date: Tue, 15 Oct 2019 17:15:49 -0400 Subject: [PATCH 09/45] In the hipMemset2D and hipMemset3D tests synchronize with the default stream after performing an async memset. [ROCm/clr commit: cc5abec0929ca815691d2c8faae4b5ca3dcb367e] --- .../clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp | 7 ++++--- .../clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp | 5 +++-- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp index 73f3f5d415..449f0b6f78 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -45,7 +45,7 @@ bool testhipMemset2D(int memsetval,int p_gpuDevice) char *A_d; char *A_h; bool testResult = true; - HIPCHECK ( hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16) ); + HIPCHECK(hipMemAllocPitch((hipDeviceptr_t*)&A_d, &pitch_A, width , numH,16)); A_h = (char*)malloc(sizeElements); HIPASSERT(A_h != NULL); for (size_t i=0; i Date: Tue, 15 Oct 2019 21:47:33 -0400 Subject: [PATCH 10/45] Add -fhip-new-launch-api to hipcc for HIP/VDI [ROCm/clr commit: 739530d53b4758baf3ccbb31af408401046bed9e] --- projects/clr/hipamd/bin/hipcc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index 35fbb54397..77b7b7ebdd 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -234,6 +234,8 @@ if ($HIP_PLATFORM eq "clang") { if ($HIP_RUNTIME eq "HCC" ) { $HSA_PATH=$ENV{'HSA_PATH'} // "$ROCM_PATH/hsa"; $HIPCXXFLAGS .= " -isystem $HSA_PATH/include"; + } else { + $HIPCXXFLAGS .= " -fhip-new-launch-api"; } } elsif ($HIP_PLATFORM eq "hcc") { From e7fb74b07f1ee0e59ddffe99d0507c1960cea4da Mon Sep 17 00:00:00 2001 From: Xiaozhu Meng Date: Wed, 16 Oct 2019 00:18:55 -0500 Subject: [PATCH 11/45] Fix struct declaration for C (#1524) This change is necessary for HPCToolkit to use Roctracer to produce code centric profiling view. [ROCm/clr commit: f9b8a01c779e9ebf298ad6af02f370ea16f2f8c1] --- projects/clr/hipamd/hip_prof_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/hip_prof_gen.py b/projects/clr/hipamd/hip_prof_gen.py index d1203a64d6..9e90c1558c 100755 --- a/projects/clr/hipamd/hip_prof_gen.py +++ b/projects/clr/hipamd/hip_prof_gen.py @@ -348,7 +348,7 @@ def generate_prof_header(f, api_map, opts_map): # Generating the callbacks data structure f.write('\n// HIP API callbacks data structure\n') f.write( - 'struct hip_api_data_t {\n' + + 'typedef struct hip_api_data_t {\n' + ' uint64_t correlation_id;\n' + ' uint32_t phase;\n' + ' union {\n' @@ -364,7 +364,7 @@ def generate_prof_header(f, api_map, opts_map): f.write(' } ' + name + ';\n') f.write( ' } args;\n' + - '};\n' + '} hip_api_data_t;\n' ) # Generating the callbacks args data filling macros From 1ec284d333812e29ac998f7438a260563b47602e Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Wed, 16 Oct 2019 10:49:16 +0530 Subject: [PATCH 12/45] Adding code object manager to rtc (#1526) Adding Code Object Manager file to rtc to resolve address of Bundled_code_object in libhiprtc.so [ROCm/clr commit: b3351561c587554ceaaa2c62a936e03943dac5fa] --- projects/clr/hipamd/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index 0ad40a10ea..28b8683b22 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -289,7 +289,7 @@ if(HIP_PLATFORM STREQUAL "hcc") target_link_libraries(hip_hcc PRIVATE hc_am) target_link_libraries(hip_hcc_static PRIVATE hc_am) - add_library(hiprtc SHARED src/hiprtc.cpp) + add_library(hiprtc SHARED src/hiprtc.cpp src/code_object_bundle.cpp) target_include_directories( hiprtc SYSTEM PRIVATE ${PROJECT_SOURCE_DIR}/include ${HSA_PATH}/include) From a7d6c03e17a3f111139b3bcbd14c62acbf4cc44e Mon Sep 17 00:00:00 2001 From: Nick Curtis Date: Wed, 16 Oct 2019 00:19:56 -0500 Subject: [PATCH 13/45] Guard against division by zero for no VGPR usage (e.g., in an empty kernel) (#1528) * guard against division by zero for no VGPR usage (e.g., in an empty kernel) * fix bracket format * clean up parenthesis [ROCm/clr commit: d16963c9d5f906d74fe0636a785d9d02c6c20572] --- projects/clr/hipamd/src/hip_module.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index d98b98a378..b3afdd4ffe 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -1326,17 +1326,18 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( size_t numWavefronts = (blockSize + wavefrontSize - 1) / wavefrontSize; size_t availableVGPRs = (prop.regsPerBlock / wavefrontSize / simdPerCU); - size_t vgprs_alu_occupancy = simdPerCU * std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS); + size_t vgprs_alu_occupancy = simdPerCU * (usedVGPRS == 0 ? maxWavesPerSimd + : std::min(maxWavesPerSimd, availableVGPRs / usedVGPRS)); // Calculate blocks occupancy per CU based on VGPR usage *numBlocks = vgprs_alu_occupancy / numWavefronts; const size_t availableSGPRs = (prop.gcnArch < 800) ? 512 : 800; - size_t sgprs_alu_occupancy = simdPerCU * ((usedSGPRS == 0) ? maxWavesPerSimd + size_t sgprs_alu_occupancy = simdPerCU * (usedSGPRS == 0 ? maxWavesPerSimd : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); size_t total_used_lds = usedLDS + dynSharedMemPerBlk; if (total_used_lds != 0) { From 19f22b468bc8cb1c0157da65f94ff819c5af8ace Mon Sep 17 00:00:00 2001 From: kpyzhov Date: Wed, 16 Oct 2019 01:21:28 -0400 Subject: [PATCH 14/45] [hipcc] Temporary add -D_OPENMP to clang options to workaround cmake issue (#1540) * Temporary add -D_OPENMP to clang options in hipcc to allow using CMake OpenMP detection with hip-clang (until updated CMake version is available). [ROCm/clr commit: 9773f94c7191b43fb45855cc0cd1050895a3655c] --- projects/clr/hipamd/bin/hipcc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index 35fbb54397..0f97bf0e21 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -526,6 +526,13 @@ foreach $arg (@ARGV) $optArg = $arg; } + ## This is a temporary workaround for CMake detection of OpenMP support. + ## It should be removed when the OpenMP detection c++ test in CMake is updated + ## and corrected CMake version is available. + if((defined $HIP_COMPILER) and ($HIP_COMPILER eq "clang") and ($arg eq '-fopenmp')) { + $HIPCXXFLAGS .= " -D_OPENMP " + } + ## process linker response file for hip-clang ## extract object files from static library and pass them directly to ## hip-clang in command line. From 4b8d8034cf2fb00afdb508a2c8a84d07bcb4614f Mon Sep 17 00:00:00 2001 From: vsytch Date: Wed, 16 Oct 2019 01:21:48 -0400 Subject: [PATCH 15/45] Update hipMathFunctions, hipTestHalf and hipTestNativeHalf tests to support Navi10 and Navi14. (#1545) [ROCm/clr commit: c2aadd4d12b2dfd17bb2963ad796f5b6087d4213] --- .../clr/hipamd/tests/src/deviceLib/hipMathFunctions.cpp | 7 ++++++- projects/clr/hipamd/tests/src/deviceLib/hipTestHalf.cpp | 7 ++++++- .../clr/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp | 7 ++++++- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipMathFunctions.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipMathFunctions.cpp index dc064da189..b1b0e8334a 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipMathFunctions.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipMathFunctions.cpp @@ -29,7 +29,12 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void kernel_abs_int64(long long *input, long long *output) { int tx = threadIdx.x; diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipTestHalf.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipTestHalf.cpp index b78e1d8c63..751d44e242 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipTestHalf.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipTestHalf.cpp @@ -28,7 +28,12 @@ THE SOFTWARE. #include "test_common.h" -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __device__ void test_convert() { __half x; diff --git a/projects/clr/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp b/projects/clr/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp index 55213d446d..8639127c7c 100644 --- a/projects/clr/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp +++ b/projects/clr/hipamd/tests/src/deviceLib/hipTestNativeHalf.cpp @@ -32,7 +32,12 @@ THE SOFTWARE. using namespace std; -#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ || __HIP_ARCH_GFX906__ || __HIP_ARCH_GFX908__ +#if __HIP_ARCH_GFX803__ || \ + __HIP_ARCH_GFX900__ || \ + __HIP_ARCH_GFX906__ || \ + __HIP_ARCH_GFX908__ || \ + __HIP_ARCH_GFX1010__ || \ + __HIP_ARCH_GFX1012__ __global__ void __halfTest(bool* result, __half a) { From 97c823d55284da5e44e1bae0ec8de28587494c3e Mon Sep 17 00:00:00 2001 From: kjayapra-amd <54370791+kjayapra-amd@users.noreply.github.com> Date: Wed, 16 Oct 2019 01:22:15 -0400 Subject: [PATCH 16/45] Use the correct return type in runTest in 11_texture_driver sample. (#1546) Fixes SWDEV-203394. Currently in runTest() returns true, even if the texture reference copy does not happen. Using the existing testResult Flag to return from runTest(). [ROCm/clr commit: 9d571e3c9ee7a0e64cd2b89c1ac15bbed6fdab2f] --- .../samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) mode change 100644 => 100755 projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp diff --git a/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp old mode 100644 new mode 100755 index 2cb9877cac..b3c1ef5d0c --- a/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define fileName "tex2dKernel.code" texture tex; -bool testResult = false; +bool testResult = true; #define HIP_CHECK(cmd) \ { \ @@ -126,7 +126,7 @@ bool runTest(int argc, char** argv) { } hipFree(dData); hipFreeArray(array); - return true; + return testResult; } int main(int argc, char** argv) { From 643a8bcf5bff879a706c741a8492aa83060a1271 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 13:08:11 +0300 Subject: [PATCH 17/45] [HIPIFY][CUB][#1460] Implement cubFunctionTemplateDecl matcher + Add cub_02.cu test + Partial fixes #1460 [ROCm/clr commit: 5555d46e6663b303cad5f8ab6178bc2bf5522fb8] --- .../hipamd/hipify-clang/src/HipifyAction.cpp | 44 ++++++++++++ .../hipamd/hipify-clang/src/HipifyAction.h | 1 + .../unit_tests/libraries/CUB/cub_02.cu | 70 +++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp index 930f3ec8c5..60288e7500 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp @@ -61,6 +61,7 @@ const StringRef sCudaLaunchKernel = "cudaLaunchKernel"; const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; +const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -449,6 +450,41 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul return false; } +bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) { + auto *Tparams = decl->getTemplateParameters(); + bool ret = false; + for (size_t I = 0; I < Tparams->size(); ++I) { + const clang::ValueDecl *valueDecl = dyn_cast(Tparams->getParam(I)); + if (!valueDecl) continue; + clang::QualType QT = valueDecl->getType(); + auto *t = QT.getTypePtr(); + if (!t) continue; + const clang::ElaboratedType *et = t->getAs(); + if (!et) continue; + const clang::NestedNameSpecifier *nns = et->getQualifier(); + if (!nns) continue; + const clang::NamespaceDecl *nsd = nns->getAsNamespace(); + if (!nsd) continue; + const clang::SourceRange sr = valueDecl->getSourceRange(); + clang::SourceLocation sl(sr.getBegin()); + clang::SourceLocation end(sr.getEnd()); + auto &SM = getCompilerInstance().getSourceManager(); + size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); + StringRef sfull = StringRef(SM.getCharacterData(sl), length); + std::string name = nsd->getDeclName().getAsString(); + size_t offset = sfull.find(name); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + ret = true; + } + return ret; + } + return false; +} + bool HipifyAction::cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result) { if (auto *call = Result.Nodes.getNodeAs(sCudaHostFuncCall)) { if (!call->getNumArgs()) return false; @@ -555,6 +591,13 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ).bind(sCubNamespacePrefix), this ); + // TODO: Maybe worth to make it more concrete based on final cubFunctionTemplateDecl + Finder->addMatcher( + mat::functionTemplateDecl( + mat::isExpansionInMainFile() + ).bind(sCubFunctionTemplateDecl), + this + ); // Ownership is transferred to the caller. return Finder->newASTConsumer(); } @@ -668,4 +711,5 @@ void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaHostFuncCall(Result)) return; if (cudaDeviceFuncCall(Result)) return; if (cubNamespacePrefix(Result)) return; + if (cubFunctionTemplateDecl(Result)) return; } diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h index 78532449ee..38a0deba9e 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h @@ -73,6 +73,7 @@ public: bool cudaDeviceFuncCall(const mat::MatchFinder::MatchResult &Result); bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); + bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu new file mode 100644 index 0000000000..aff5def3fa --- /dev/null +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu @@ -0,0 +1,70 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +#include + +template +__global__ void sort(const T* data_in, T* data_out){ + // CHECK: typedef hipcub::BlockLoad BlockLoadT; + typedef cub::BlockLoad BlockLoadT; + // CHECK: typedef hipcub::BlockRadixSort BlockRadixSortT; + typedef cub::BlockRadixSort BlockRadixSortT; + // CHECK: typedef hipcub::BlockStore BlockStoreT; + typedef cub::BlockStore BlockStoreT; + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockRadixSortT::TempStorage sort; + typename BlockStoreT::TempStorage store; + } tmp_storage; + T items[ITEMS_PER_THREAD]; + BlockLoadT(tmp_storage.load).Load(data_in + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); + __syncthreads(); + BlockRadixSortT(tmp_storage.sort).Sort(items); + __syncthreads(); + BlockStoreT(tmp_storage.store).Store(data_out + blockIdx.x * BLOCK_WIDTH * ITEMS_PER_THREAD, items); +} + +int main() { + double* d_gpu = NULL; + double* result_gpu = NULL; + double* data_sorted = new double[1000*4096]; + // Allocate memory on the GPU + // CHECK: hipMalloc(&d_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&d_gpu, 1000*4096 * sizeof(double)); + // CHECK: hipMalloc(&result_gpu, 1000*4096 * sizeof(double)); + cudaMalloc(&result_gpu, 1000*4096 * sizeof(double)); + // CHECK: hiprandGenerator_t gen; + curandGenerator_t gen; + // Create generator + // CHECK: hiprandCreateGenerator(&gen, HIPRAND_RNG_PSEUDO_DEFAULT); + curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT); + // Fill array with random numbers + // CHECK: hiprandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + curandGenerateNormalDouble(gen, d_gpu, 1000*4096, 0.0, 1.0); + // Destroy generator + // CHECK: hiprandDestroyGenerator(gen); + curandDestroyGenerator(gen); + // Sort data + // TODO: Substitution of cub namespace in CUDAKernelCallExpr + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); + sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); + sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu); + // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost); + cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost); + // Write the sorted data to standard out + for (int i = 0; i < 4095; ++i) { + std::cout << data_sorted[i] << ", "; + } + std::cout << data_sorted[4095] << std::endl; +} From 97f10790eb9eac7c064c2bee9a764d82229bd05f Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 13:43:56 +0300 Subject: [PATCH 18/45] [HIPIFY] Refactor a couple of matcher functions + Separate out GetSubstrLocation function for finding substr SourceLocation in a given SourceRange [ROCm/clr commit: 0a200487598f1395746d3941650fbd0dd615e69b] --- .../hipamd/hipify-clang/src/HipifyAction.cpp | 35 ++++++++----------- .../hipamd/hipify-clang/src/HipifyAction.h | 2 ++ 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp index 60288e7500..269e66fed1 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp @@ -111,6 +111,19 @@ void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { } } +clang::SourceLocation HipifyAction::GetSubstrLocation(const std::string &str, const clang::SourceRange &sr) { + clang::SourceLocation sl(sr.getBegin()); + clang::SourceLocation end(sr.getEnd()); + auto &SM = getCompilerInstance().getSourceManager(); + size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); + StringRef sfull = StringRef(SM.getCharacterData(sl), length); + size_t offset = sfull.find(str); + if (offset > 0) { + sl = sl.getLocWithOffset(offset); + } + return sl; +} + /** * Look at, and consider altering, a given token. * @@ -434,17 +447,8 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul const clang::TypeSourceInfo *si = decl->getTypeSourceInfo(); const clang::TypeLoc tloc = si->getTypeLoc(); const clang::SourceRange sr = tloc.getSourceRange(); - clang::SourceLocation sl(sr.getBegin()); - clang::SourceLocation end(sr.getEnd()); - auto &SM = getCompilerInstance().getSourceManager(); - size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); - StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); - size_t offset = sfull.find(name); - if (offset > 0) { - sl = sl.getLocWithOffset(offset); - } - FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); return true; } return false; @@ -467,17 +471,8 @@ bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult & const clang::NamespaceDecl *nsd = nns->getAsNamespace(); if (!nsd) continue; const clang::SourceRange sr = valueDecl->getSourceRange(); - clang::SourceLocation sl(sr.getBegin()); - clang::SourceLocation end(sr.getEnd()); - auto &SM = getCompilerInstance().getSourceManager(); - size_t length = SM.getCharacterData(end) - SM.getCharacterData(sl); - StringRef sfull = StringRef(SM.getCharacterData(sl), length); std::string name = nsd->getDeclName().getAsString(); - size_t offset = sfull.find(name); - if (offset > 0) { - sl = sl.getLocWithOffset(offset); - } - FindAndReplace(name, sl, CUDA_CUB_TYPE_NAME_MAP); + FindAndReplace(name, GetSubstrLocation(name, sr), CUDA_CUB_TYPE_NAME_MAP); ret = true; } return ret; diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h index 38a0deba9e..73879bfd14 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h @@ -63,6 +63,8 @@ private: void RewriteString(StringRef s, clang::SourceLocation start); // Replace a CUDA identifier with the corresponding hip identifier, if applicable. void RewriteToken(const clang::Token &t); + // Calculate str's SourceLocation in SourceRange sr + clang::SourceLocation GetSubstrLocation(const std::string &str, const clang::SourceRange &sr); public: explicit HipifyAction(ct::Replacements *replacements): clang::ASTFrontendAction(), From 577bac5de82437b88bf64d9f4aca545d021fd8cd Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Wed, 16 Oct 2019 11:02:38 -0400 Subject: [PATCH 19/45] hipMemset2D and hipMemset3D tests should be passing by default. [ROCm/clr commit: 86d0c5fa5a54a0d754a62a0d0e134f8733e70181] --- projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp | 2 +- projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp index 449f0b6f78..2eb62a859f 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset2D.cpp @@ -113,7 +113,7 @@ int main(int argc, char *argv[]) hipCtx_t context; hipCtxCreate(&context, 0, p_gpuDevice); - bool testResult = false; + bool testResult = true; testResult &= testhipMemset2D(memsetval, p_gpuDevice); testResult &= testhipMemset2DAsync(memsetval, p_gpuDevice); hipCtxDestroy(context); diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp index a47b609c73..1917559f2a 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemset3D.cpp @@ -134,7 +134,7 @@ int main(int argc, char *argv[]) { HipTest::parseStandardArguments(argc, argv, true); HIPCHECK(hipSetDevice(p_gpuDevice)); - bool testResult = false; + bool testResult = true; testResult &= testhipMemset3D(memsetval, p_gpuDevice); testResult &= testhipMemset3DAsync(memsetval, p_gpuDevice); if (testResult) { From b35730161064ac2ca8887787ca57537257d906d3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 16 Oct 2019 19:02:13 +0300 Subject: [PATCH 20/45] [HIPIFY][CUB][#1460] Add cub:: namespace support in TemplateInstantiation of cudaLaunchKernel + Update cub_02.cu test accordingly [ROCm/clr commit: e557563947836a1bba07676bfc51ea074a66a8e0] --- .../hipamd/hipify-clang/src/HipifyAction.cpp | 22 +++++++++++++++++-- .../unit_tests/libraries/CUB/cub_02.cu | 5 ++--- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp index 269e66fed1..f63da1b2de 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp @@ -36,6 +36,7 @@ THE SOFTWARE. const std::string sHIP = "HIP"; const std::string sROC = "ROC"; const std::string sCub = "cub"; +const std::string sHipcub = "hipcub"; const std::string sHIP_DYNAMIC_SHARED = "HIP_DYNAMIC_SHARED"; const std::string sHIP_KERNEL_NAME = "HIP_KERNEL_NAME"; std::string sHIP_SYMBOL = "HIP_SYMBOL"; @@ -345,9 +346,26 @@ bool HipifyAction::cudaLaunchKernel(const mat::MatchFinder::MatchResult &Result) llvm::raw_svector_ostream OS(XStr); clang::LangOptions DefaultLangOptions; auto *SM = Result.SourceManager; + clang::SourceRange sr = calleeExpr->getSourceRange(); + std::string kern = readSourceText(*SM, sr).str(); OS << sHipLaunchKernelGGL << "("; - if (caleeDecl->isTemplateInstantiation()) OS << sHIP_KERNEL_NAME << "("; - OS << readSourceText(*SM, calleeExpr->getSourceRange()); + if (caleeDecl->isTemplateInstantiation()) { + OS << sHIP_KERNEL_NAME << "("; + std::string cub = sCub + "::"; + std::string hipcub; + const auto found = CUDA_CUB_TYPE_NAME_MAP.find(sCub); + if (found != CUDA_CUB_TYPE_NAME_MAP.end()) { + hipcub = found->second.hipName.str() + "::"; + } else { + hipcub = sHipcub + "::"; + } + size_t pos = kern.find(cub); + while (pos != std::string::npos) { + kern.replace(pos, cub.size(), hipcub); + pos = kern.find(cub, pos + hipcub.size()); + } + } + OS << kern; if (caleeDecl->isTemplateInstantiation()) OS << ")"; OS << ", "; // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero. diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu index aff5def3fa..21898baa03 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_02.cu @@ -55,10 +55,9 @@ int main() { // CHECK: hiprandDestroyGenerator(gen); curandDestroyGenerator(gen); // Sort data - // TODO: Substitution of cub namespace in CUDAKernelCallExpr - // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<512, 8, hipcub::BLOCK_LOAD_TRANSPOSE, hipcub::BLOCK_STORE_TRANSPOSE>), dim3(1000), dim3(512), 0, 0, d_gpu, result_gpu); sort<512, 8, cub::BLOCK_LOAD_TRANSPOSE, cub::BLOCK_STORE_TRANSPOSE><<<1000, 512>>>(d_gpu, result_gpu); - // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); + // CHECK: hipLaunchKernelGGL(HIP_KERNEL_NAME(sort<256, 16, hipcub::BLOCK_LOAD_DIRECT, hipcub::BLOCK_STORE_DIRECT>), dim3(1000), dim3(256), 0, 0, d_gpu, result_gpu); sort<256, 16, cub::BLOCK_LOAD_DIRECT, cub::BLOCK_STORE_DIRECT><<<1000, 256>>>(d_gpu, result_gpu); // CHECK: hipMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), hipMemcpyDeviceToHost); cudaMemcpy(data_sorted, result_gpu, 1000*4096*sizeof(double), cudaMemcpyDeviceToHost); From c8238e1fd4fd971b38142d4b13890ba46272da4b Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 17 Oct 2019 15:05:55 +0300 Subject: [PATCH 21/45] [HIPIFY][cmake] Add install rule for clang-resource-headers + Fix: set destination for all installing files to ${CMAKE_INSTALL_PREFIX} [ROCm/clr commit: 8c3dff7ab9aa79caf6d1b08b8ada7ca3f1e33c41] --- projects/clr/hipamd/hipify-clang/CMakeLists.txt | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/hipify-clang/CMakeLists.txt b/projects/clr/hipamd/hipify-clang/CMakeLists.txt index fd172a9c8d..bf4ef6872d 100644 --- a/projects/clr/hipamd/hipify-clang/CMakeLists.txt +++ b/projects/clr/hipamd/hipify-clang/CMakeLists.txt @@ -80,7 +80,21 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"") -install(TARGETS hipify-clang DESTINATION bin) +install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX}) + +install( + DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ + DESTINATION ${CMAKE_INSTALL_PREFIX} + COMPONENT clang-resource-headers + FILES_MATCHING + PATTERN "*.h" + PATTERN "*.modulemap" + PATTERN "algorithm" + PATTERN "complex" + PATTERN "new" + PATTERN "ppc_wrappers" EXCLUDE + PATTERN "openmp_wrappers" EXCLUDE + ) if (HIPIFY_CLANG_TESTS) find_package(PythonInterp 2.7 REQUIRED) From 714314fa661725233f349db57bcfefc75d89d8a4 Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 17 Oct 2019 09:11:32 -0700 Subject: [PATCH 22/45] Revert "hipcc defaults to code object v3 (#1298)" This reverts commit e5a2ba9602a466fd5c151acf8fc5cec178ec5adc. [ROCm/clr commit: 446718f990bc591ecacd693e91754035dcbaba2a] --- projects/clr/hipamd/bin/hipcc | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/projects/clr/hipamd/bin/hipcc b/projects/clr/hipamd/bin/hipcc index 0f97bf0e21..3686d51abe 100755 --- a/projects/clr/hipamd/bin/hipcc +++ b/projects/clr/hipamd/bin/hipcc @@ -352,7 +352,6 @@ my $runCmd = 1; my $buildDeps = 0; my $linkType = 1; my $setLinkType = 0; -my $coFormatv3 = 1; my @options = (); my @inputs = (); @@ -472,22 +471,6 @@ foreach $arg (@ARGV) $swallowArg = 1; } - # code object format parsing - if ($trimarg eq '-mcode-object-v3') { - $coFormatv3 = 1; - # hip-clang already recognizes -mcode-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if ($trimarg eq '-mno-code-object-v3') { - $coFormatv3 = 0; - # hip-clang already recognizes -mno-code-object-v3, so we just pass it on - if ($HIP_PLATFORM eq 'hcc') { - $swallowArg = 1; - } - } - if (($arg =~ /--genco/) and $HIP_PLATFORM eq 'clang' ) { $arg = "--cuda-device-only"; } @@ -870,12 +853,6 @@ if($HIP_PLATFORM eq "hcc" or $HIP_PLATFORM eq "clang"){ } } -# hcc defaults to v2, so we need to convert to the appropriate flag -# hip-clang defaults to v3, so we don't need to do anything -if ($coFormatv3 and $HIP_PLATFORM eq 'hcc') { - $HIPLDFLAGS .= " -Wl,-hcc-cov3 "; -} - if ($hasC and $HIP_PLATFORM eq 'nvcc') { $HIPCXXFLAGS .= " -x cu"; } From 9fb60fa36aec49cffa4bacd3fd06f0ddd7ff774c Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Thu, 17 Oct 2019 22:26:48 +0300 Subject: [PATCH 23/45] [HIPIFY][doc] Update README.md + Versions, testing [ROCm/clr commit: 1165e6bd718c424eaeb2f6d98f58e810f3a9cfd2] --- projects/clr/hipamd/hipify-clang/README.md | 155 +++++++++++---------- 1 file changed, 78 insertions(+), 77 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index bb9654d8b4..f53a87f95c 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -154,7 +154,7 @@ To run it: * Path to CUB should be specified by the `CUDA_CUB_ROOT_DIR` option: - - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/CUB` + - Linux: `-DCUDA_CUB_ROOT_DIR=/srv/git/CUB` - Windows: `-DCUDA_CUB_ROOT_DIR=f:/GIT/cub` @@ -194,9 +194,9 @@ Ubuntu 14: LLVM 5.0.0 - 6.0.1, CUDA 7.0 - 9.0, cudnn-5.0.5 - cudnn-7.6.4.38 Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 -Build system for the above configurations: +Minimum build system requirements for the above configurations: -Python 2.7 (min), cmake 3.5.2 (min), GNU C/C++ 5.4.0 (min). +Python 2.7, cmake 3.5.2, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -208,7 +208,7 @@ cmake -DCMAKE_PREFIX_PATH=/srv/git/LLVM/9.0.0/dist \ -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-10.1 \ -DCUDA_DNN_ROOT_DIR=/srv/CUDNN/cudnn-10.1-v7.6.4.38 \ - -DCUDA_CUB_ROOT_DIR=/srv/CUB \ + -DCUDA_CUB_ROOT_DIR=/srv/git/CUB \ -DLLVM_EXTERNAL_LIT=/srv/git/LLVM/9.0.0/build/bin/llvm-lit \ .. ``` @@ -264,88 +264,89 @@ Linux 5.2.0 - Platform OS 64 - hipify-clang binary bitness 64 - python 2.7.12 binary bitness ======================================== --- Testing: 64 tests, 12 threads -- -PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 64) -PASS: hipify :: unit_tests/device/math_functions.cu (2 of 64) -PASS: hipify :: unit_tests/device/atomics.cu (3 of 64) -PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 64) -PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 64) -PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 64) -PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 64) -PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 64) -PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 64) -PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 64) -PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 64) -PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 64) -PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 64) -PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 64) -PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 64) -PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 64) -PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 64) -PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 64) -PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 64) -PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 64) -PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 64) -PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 64) -PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 64) -PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 64) -PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 64) -PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 64) -PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 64) -PASS: hipify :: unit_tests/samples/allocators.cu (56 of 64) -PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 64) -PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 64) -PASS: hipify :: unit_tests/samples/axpy.cu (59 of 64) -PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 64) -PASS: hipify :: unit_tests/samples/intro.cu (61 of 64) -PASS: hipify :: unit_tests/samples/square.cu (62 of 64) -PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 64) -PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 64) -Testing Time: 2.98s - Expected Passes : 64 +-- Testing: 65 tests, 12 threads -- +PASS: hipify :: unit_tests/casts/reinterpret_cast.cu (1 of 65) +PASS: hipify :: unit_tests/device/math_functions.cu (2 of 65) +PASS: hipify :: unit_tests/device/atomics.cu (3 of 65) +PASS: hipify :: unit_tests/device/device_symbols.cu (4 of 65) +PASS: hipify :: unit_tests/headers/headers_test_02.cu (5 of 65) +PASS: hipify :: unit_tests/headers/headers_test_03.cu (6 of 65) +PASS: hipify :: unit_tests/headers/headers_test_01.cu (7 of 65) +PASS: hipify :: unit_tests/headers/headers_test_04.cu (8 of 65) +PASS: hipify :: unit_tests/headers/headers_test_05.cu (9 of 65) +PASS: hipify :: unit_tests/headers/headers_test_07.cu (10 of 65) +PASS: hipify :: unit_tests/headers/headers_test_06.cu (11 of 65) +PASS: hipify :: unit_tests/headers/headers_test_11.cu (12 of 65) +PASS: hipify :: unit_tests/headers/headers_test_08.cu (13 of 65) +PASS: hipify :: unit_tests/headers/headers_test_10.cu (14 of 65) +PASS: hipify :: unit_tests/headers/headers_test_09.cu (15 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_02.cu (16 of 65) +PASS: hipify :: unit_tests/libraries/CAFFE2/caffe2_01.cu (17 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_01.cu (18 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_1_based_indexing.cu (19 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_0_based_indexing.cu (20 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/cublas_sgemm_matrix_multiplication.cu (21 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_0_based_indexing_rocblas.cu (22 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_sgemm_matrix_multiplication_rocblas.cu (23 of 65) +PASS: hipify :: unit_tests/libraries/cuBLAS/rocBLAS/cublas_1_based_indexing_rocblas.cu (24 of 65) +PASS: hipify :: unit_tests/libraries/cuComplex/cuComplex_Julia.cu (25 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_softmax.cu (26 of 65) +PASS: hipify :: unit_tests/libraries/cuDNN/cudnn_convolution_forward.cu (27 of 65) +PASS: hipify :: unit_tests/libraries/cuFFT/simple_cufft.cu (28 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/poisson_api_example.cu (29 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_01.cu (30 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_generate.cpp (31 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_02.cu (32 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_03.cu (33 of 65) +PASS: hipify :: unit_tests/libraries/cuRAND/benchmark_curand_kernel.cpp (34 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_04.cu (35 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_05.cu (36 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_06.cu (37 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_07.cu (38 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_09.cu (39 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_10.cu (40 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_08.cu (41 of 65) +PASS: hipify :: unit_tests/namespace/ns_kernel_launch.cu (42 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals.cu (43 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_11.cu (44 of 65) +PASS: hipify :: unit_tests/libraries/cuSPARSE/cuSPARSE_12.cu (45 of 65) +PASS: hipify :: unit_tests/pp/pp_if_else_conditionals_01.cu (46 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/tex2dKernel.cpp (47 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/0_MatrixTranspose/MatrixTranspose.cpp (48 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp (49 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/13_occupancy/occupancy.cpp (50 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/1_hipEvent/hipEvent.cpp (51 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/2_Profiler/Profiler.cpp (52 of 65) +PASS: hipify :: unit_tests/samples/MallocManaged.cpp (53 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/7_streams/stream.cpp (54 of 65) +PASS: hipify :: unit_tests/samples/2_Cookbook/8_peer2peer/peer2peer.cpp (55 of 65) +PASS: hipify :: unit_tests/samples/allocators.cu (56 of 65) +PASS: hipify :: unit_tests/samples/coalescing.cu (57 of 65) +PASS: hipify :: unit_tests/samples/dynamic_shared_memory.cu (58 of 65) +PASS: hipify :: unit_tests/samples/axpy.cu (59 of 65) +PASS: hipify :: unit_tests/samples/cudaRegister.cu (60 of 65) +PASS: hipify :: unit_tests/samples/intro.cu (61 of 65) +PASS: hipify :: unit_tests/samples/square.cu (62 of 65) +PASS: hipify :: unit_tests/samples/static_shared_memory.cu (63 of 65) +PASS: hipify :: unit_tests/samples/vec_add.cu (64 of 65) +PASS: hipify :: unit_tests/libraries/CUB/cub_02.cu (18 of 65) +Testing Time: 3.01s + Expected Passes : 65 [100%] Built target test-hipify ``` ### Windows On Windows 10 the following configurations are tested: -LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn-5.1.10 - cudnn-7.1.4.18 +LLVM 5.0.0 - 5.0.2, CUDA 8.0, cudnn 5.1.10 - 7.1.4.18 -LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 6.0.0 - 6.0.1, CUDA 9.0, cudnn 7.0.5.15 - 7.6.4.38 -LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn-7.0.5.15 - cudnn-7.6.4.38 +LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38 -Build system for the above configurations: +Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2: -Python 3.6 - 3.7.4, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.4). +Python 3.6.0 - 3.8.0, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: @@ -371,7 +372,7 @@ cmake -- - CMake module path: F:/LLVM/9.0.0/dist/lib/cmake/llvm -- - Include path : F:/LLVM/9.0.0/dist/include -- - Binary path : F:/LLVM/9.0.0/dist/bin --- Found PythonInterp: C:/Program Files/Python37/python.exe (found suitable version "3.7.4", minimum required is "3.6") +-- Found PythonInterp: C:/Program Files/Python38/python.exe (found suitable version "3.8.0", minimum required is "3.6") -- Found lit: C:/Program Files/Python36/Scripts/lit.exe -- Found FileCheck: F:/LLVM/9.0.0/dist/bin/FileCheck.exe -- Found CUDA: C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.1 (found version "10.1") From 489e3dda9aa87d91d29947acc4d6b0d01cad06e1 Mon Sep 17 00:00:00 2001 From: Aaron Enye Shi Date: Thu, 17 Oct 2019 21:21:24 +0000 Subject: [PATCH 24/45] [HIPIFY][cmake] Make CMakeLists use default 3.5.1 for Ubuntu 16.04 [ROCm/clr commit: b3ea58abe712017346c148df3e3f9825192f013c] --- projects/clr/hipamd/hipify-clang/CMakeLists.txt | 2 +- projects/clr/hipamd/hipify-clang/README.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/CMakeLists.txt b/projects/clr/hipamd/hipify-clang/CMakeLists.txt index bf4ef6872d..25076a8895 100644 --- a/projects/clr/hipamd/hipify-clang/CMakeLists.txt +++ b/projects/clr/hipamd/hipify-clang/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required(VERSION 3.5.2) +cmake_minimum_required(VERSION 3.5.1) project(hipify-clang) diff --git a/projects/clr/hipamd/hipify-clang/README.md b/projects/clr/hipamd/hipify-clang/README.md index f53a87f95c..da5abc19da 100644 --- a/projects/clr/hipamd/hipify-clang/README.md +++ b/projects/clr/hipamd/hipify-clang/README.md @@ -196,7 +196,7 @@ Ubuntu 16-18: LLVM 8.0.0 - 9.0.0, CUDA 8.0 - 10.1, cudnn-5.1.10 - cudnn-7.6.4.38 Minimum build system requirements for the above configurations: -Python 2.7, cmake 3.5.2, GNU C/C++ 5.4.0. +Python 2.7, cmake 3.5.1, GNU C/C++ 5.4.0. Here is an example of building `hipify-clang` with testing support on `Ubuntu 16.04`: @@ -346,7 +346,7 @@ LLVM 7.0.0 - 9.0.0, CUDA 7.5 - 10.1, cudnn 7.0.5.15 - 7.6.4.38 Build system requirements for the latest configuration LLVM 9.0.0/CUDA 10.1 Update 2: -Python 3.6.0 - 3.8.0, cmake 3.5.2 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). +Python 3.6.0 - 3.8.0, cmake 3.5.1 - 3.15.5, Visual Studio 2017 (15.5.2) - 2019 (16.3.5). Here is an example of building `hipify-clang` with testing support on `Windows 10` by `Visual Studio 16 2019`: From 54eddfc8f09d22e5205e538dcb6c20a0b26ae2aa Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Thu, 17 Oct 2019 18:58:32 -0400 Subject: [PATCH 25/45] _aligned_malloc() on Windows first takes size, then alignment, which is the opposite of how the similar function behaves on Linux. Memory allocated by it also has to be freed using _aligned_free(), unlike Linux where we can use regular free(). Edit aligned_alloc() macro and add a aligned_free() one to align with the above behaviour. [ROCm/clr commit: f4440817cbfc83c2fea31442d939bf8eabeecb92] --- .../hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp | 4 ++-- projects/clr/hipamd/tests/src/test_common.h | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp index 8ce02d6164..4aacfa866d 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/memory/hipMemcpy_simple.cpp @@ -120,8 +120,8 @@ void simpleTest2(size_t numElements, bool usePinnedHost) { HIPCHECK(hipHostFree(A_h1)); HIPCHECK(hipHostFree(A_h2)); } else { - free(A_h1); - free(A_h2); + aligned_free(A_h1); + aligned_free(A_h2); } } diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index 73a952b0d1..dd77cd64a3 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -99,11 +99,15 @@ THE SOFTWARE. #ifdef _WIN64 #include -#define aligned_alloc _aligned_malloc +#define aligned_alloc(x,y) _aligned_malloc(y,x) +#define aligned_free(x) _aligned_free(x) #define popen(x,y) _popen(x,y) #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) #endif +#else +#define aligned_free(x) free(x) +#endif // standard command-line variables: extern size_t N; From 85281b1d8657cf105ef456167e8a364640c93c1e Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 18 Oct 2019 18:51:40 +0300 Subject: [PATCH 26/45] [HIPIFY][CUB][#1460] Add "using namespace cub" translation support + Add cub_03.cu [ROCm/clr commit: 86f6756b0281a9136186c0eb8828a904ebd5f284] --- .../hipamd/hipify-clang/src/HipifyAction.cpp | 19 ++++++++++ .../hipamd/hipify-clang/src/HipifyAction.h | 1 + .../unit_tests/libraries/CUB/cub_03.cu | 37 +++++++++++++++++++ 3 files changed, 57 insertions(+) create mode 100644 projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp index f63da1b2de..510d91978a 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.cpp @@ -63,6 +63,7 @@ const StringRef sCudaHostFuncCall = "cudaHostFuncCall"; const StringRef sCudaDeviceFuncCall = "cudaDeviceFuncCall"; const StringRef sCubNamespacePrefix = "cubNamespacePrefix"; const StringRef sCubFunctionTemplateDecl = "cubFunctionTemplateDecl"; +const StringRef sCubUsingNamespaceDecl = "cubUsingNamespaceDecl"; std::set DeviceSymbolFunctions0 { {sCudaMemcpyToSymbol}, @@ -472,6 +473,16 @@ bool HipifyAction::cubNamespacePrefix(const mat::MatchFinder::MatchResult &Resul return false; } +bool HipifyAction::cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result) { + if (auto *decl = Result.Nodes.getNodeAs(sCubUsingNamespaceDecl)) { + if (auto nsd = decl->getNominatedNamespace()) { + FindAndReplace(nsd->getDeclName().getAsString(), decl->getIdentLocation(), CUDA_CUB_TYPE_NAME_MAP); + return true; + } + } + return false; +} + bool HipifyAction::cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result) { if (auto *decl = Result.Nodes.getNodeAs(sCubFunctionTemplateDecl)) { auto *Tparams = decl->getTemplateParameters(); @@ -611,6 +622,13 @@ std::unique_ptr HipifyAction::CreateASTConsumer(clang::Compi ).bind(sCubFunctionTemplateDecl), this ); + // TODO: Maybe worth to make it more concrete + Finder->addMatcher( + mat::usingDirectiveDecl( + mat::isExpansionInMainFile() + ).bind(sCubUsingNamespaceDecl), + this + ); // Ownership is transferred to the caller. return Finder->newASTConsumer(); } @@ -725,4 +743,5 @@ void HipifyAction::run(const mat::MatchFinder::MatchResult &Result) { if (cudaDeviceFuncCall(Result)) return; if (cubNamespacePrefix(Result)) return; if (cubFunctionTemplateDecl(Result)) return; + if (cubUsingNamespaceDecl(Result)) return; } diff --git a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h index 73879bfd14..f70d17dd0b 100644 --- a/projects/clr/hipamd/hipify-clang/src/HipifyAction.h +++ b/projects/clr/hipamd/hipify-clang/src/HipifyAction.h @@ -76,6 +76,7 @@ public: bool cudaHostFuncCall(const mat::MatchFinder::MatchResult &Result); bool cubNamespacePrefix(const mat::MatchFinder::MatchResult &Result); bool cubFunctionTemplateDecl(const mat::MatchFinder::MatchResult &Result); + bool cubUsingNamespaceDecl(const mat::MatchFinder::MatchResult &Result); // Called by the preprocessor for each include directive during the non-raw lexing pass. void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token &include_token, diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu new file mode 100644 index 0000000000..8f68bb40c6 --- /dev/null +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -0,0 +1,37 @@ +// RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args +// CHECK: #include +#include +// CHECK: #include +#include +// CHECK: #include +#include + +#include + +// using namespace hipcub; +using namespace cub; + +/** +* Simple CUDA kernel for computing tiled partial sums +*/ +template +__global__ void ScanTilesKernel(int *d_in, int *d_out) { + // Specialize collective types for problem context + // TODO: typedef cub::BlockLoad BlockLoadT; + typedef BlockLoad BlockLoadT; + typedef BlockScan BlockScanT; + // Allocate on-chip temporary storage + __shared__ union { + typename BlockLoadT::TempStorage load; + typename BlockScanT::TempStorage reduce; + } temp_storage; + // Load data per thread + int thread_data[ITEMS_PER_THREAD]; + int offset = blockIdx.x * (BLOCK_THREADS * ITEMS_PER_THREAD); + BlockLoadT(temp_storage.load).Load(d_in + offset, offset); + __syncthreads(); + // Compute the block-wide prefix sum + BlockScanT(temp_storage).Sum(thread_data); +} From bb20336fa64bcbcf162db5cdc5b5f33475af0c2a Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Fri, 18 Oct 2019 18:55:52 +0300 Subject: [PATCH 27/45] [HIPIFY][tests] Test clean-up [ROCm/clr commit: 44a897a1464130267c5acc96f0c446f7e6b72100] --- .../tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu index 8f68bb40c6..9fdbc17515 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -6,14 +6,10 @@ // CHECK: #include #include -#include - // using namespace hipcub; using namespace cub; -/** -* Simple CUDA kernel for computing tiled partial sums -*/ +// Simple CUDA kernel for computing tiled partial sums template From 33acfa17c1a32dc0940b29f95a6bf2a229853eb1 Mon Sep 17 00:00:00 2001 From: Vladislav Sytchenko Date: Fri, 18 Oct 2019 16:40:29 -0400 Subject: [PATCH 28/45] Remove extra #endif. [ROCm/clr commit: 432380aa5da6f02ce6331b7f3b24b3b6d0fec71e] --- projects/clr/hipamd/tests/src/test_common.h | 1 - 1 file changed, 1 deletion(-) diff --git a/projects/clr/hipamd/tests/src/test_common.h b/projects/clr/hipamd/tests/src/test_common.h index dd77cd64a3..67a8e5e60a 100644 --- a/projects/clr/hipamd/tests/src/test_common.h +++ b/projects/clr/hipamd/tests/src/test_common.h @@ -104,7 +104,6 @@ THE SOFTWARE. #define popen(x,y) _popen(x,y) #define pclose(x) _pclose(x) #define setenv(x,y,z) _putenv_s(x,y) -#endif #else #define aligned_free(x) free(x) #endif From e07be75489c4928ffc2789f7556ebccf6d52fdb9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sun, 20 Oct 2019 20:03:18 +0300 Subject: [PATCH 29/45] [HIPIFY][tests] Exclude all CUB tests if CUDA_CUB_ROOT_DIR is not set [ROCm/clr commit: 5bf1ff19ff80584ab501185151d1a1f90eaaf825] --- projects/clr/hipamd/tests/hipify-clang/lit.cfg | 2 ++ 1 file changed, 2 insertions(+) diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index 89e17249f0..0a40dda623 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -33,6 +33,8 @@ if not config.cuda_dnn_root: warns = True if not config.cuda_cub_root: config.excludes.append('cub_01.cu') + config.excludes.append('cub_02.cu') + config.excludes.append('cub_03.cu') print("WARN: CUB tests are excluded due to unset CUDA_CUB_ROOT_DIR") warns = True if warns: From 3a45daed0a1bf3e99534ab6382b36694c6026cf9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Sun, 20 Oct 2019 20:08:56 +0300 Subject: [PATCH 30/45] [HIPIFY][tests] Set -I for CUDA path instead of --cuda-path for LLVM < 4 [ROCm/clr commit: ff6057d1ff4147bb7b06f23a5312d1d3b0dd05eb] --- projects/clr/hipamd/tests/hipify-clang/lit.cfg | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index 0a40dda623..c9c5b83bd3 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -124,7 +124,11 @@ elif config.cuda_cub_root: else: config.substitutions.append(("%clang_args", clang_arguments % config.cuda_sdk_root)) -hipify_arguments = "--cuda-path='%s'" +if config.llvm_version_major < 4: + hipify_arguments = "-I'%s'/include" +else: + hipify_arguments = "--cuda-path='%s'" + config.substitutions.append(("%hipify_args", hipify_arguments % config.cuda_root)) config.substitutions.append(("hipify", '"' + hipify_path + "/hipify-clang" + '"')) config.substitutions.append(("%run_test", '"' + config.test_source_root + "/run_test" + run_test_ext + '"')) From 665a200247e4a13d91b49383315b97005aaa44df Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 15:51:25 +0300 Subject: [PATCH 31/45] [HIPIFY][tests] Set max clang's CudaArch for corresponding CUDA version [Reason] To support maximum CUDA features in offline tests + Add CUDA_VERSION >= 800 restriction for atomics.cu [TODO] Find a way to use or exclude atomicAdd for doubles if LLVM < 7, because LLVM 6.0.1 and older do not use --cuda-gpu-arch in clang's Driver code at all (option is only declared) [ROCm/clr commit: 9fc7afa7385895f1461ac67b463fad15a545d516] --- projects/clr/hipamd/tests/hipify-clang/lit.cfg | 11 +++++++++++ .../tests/hipify-clang/unit_tests/device/atomics.cu | 2 ++ 2 files changed, 13 insertions(+) diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index c9c5b83bd3..79be2b3d39 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -108,6 +108,17 @@ else: if config.pointer_size == 8: clang_arguments += " -D__LP64__" +# Set max clang's CudaArch for corresponding CUDA version +# to support maximum CUDA features in offline tests +if config.cuda_version_major == 7: + clang_arguments += " --cuda-gpu-arch=sm_52" +if config.cuda_version_major == 8: + clang_arguments += " --cuda-gpu-arch=sm_62" +if config.cuda_version_major == 9: + clang_arguments += " --cuda-gpu-arch=sm_70" +if config.cuda_version_major == 10: + clang_arguments += " --cuda-gpu-arch=sm_75" + # cuDNN ROOT if config.cuda_dnn_root: clang_arguments += " -I'%s'/include" diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu index 1afd1ab541..e24b9a2175 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu @@ -276,7 +276,9 @@ int main(int argc, char** argv) { runTest(); runTest(); runTest(); +#if CUDA_VERSION >= 8000 runTest(); +#endif // CHECK: hipDeviceReset(); cudaDeviceReset(); printf("%s completed, returned %s\n", sampleName, testResult ? "OK" : "ERROR!"); From de849a44e711200bf95647ae4bca5047c84334a3 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 17:15:05 +0300 Subject: [PATCH 32/45] [HIPIFY][perl] Support of 'using namespace cub' [ROCm/clr commit: 9633cdbd8ae47570dc4ab0914420763b6389844a] --- projects/clr/hipamd/bin/hipify-perl | 7 +++++++ projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Perl.cpp | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/projects/clr/hipamd/bin/hipify-perl b/projects/clr/hipamd/bin/hipify-perl index 2e391ab8d2..09c8d2a139 100755 --- a/projects/clr/hipamd/bin/hipify-perl +++ b/projects/clr/hipamd/bin/hipify-perl @@ -1676,6 +1676,12 @@ sub transformKernelLaunch { } } +sub transformCubNamespace { + my $k = 0; + $k += s/using\s*namespace\s*cub/using namespace hipcub/g; + return $k; +} + sub transformHostFunctions { my $k = 0; foreach $func ( @@ -2368,6 +2374,7 @@ while (@ARGV) { simpleSubstitutions(); transformExternShared(); transformKernelLaunch(); + transformCubNamespace(); if ($print_stats) { while (/(\b(hip|HIP)([A-Z]|_)\w+\b)/g) { $convertedTags{$1}++; diff --git a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Perl.cpp b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Perl.cpp index df83bf8d57..8d59089d4f 100644 --- a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Perl.cpp +++ b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Perl.cpp @@ -251,6 +251,11 @@ namespace perl { *streamPtr.get() << tab_2 << "$Tkernels{$1}++;" << endl_tab << "}" << endl << "}" << endl; } + void generateCubNamespace(unique_ptr& streamPtr) { + *streamPtr.get() << endl << sub << "transformCubNamespace" << " {" << endl_tab << my_k << endl; + *streamPtr.get() << tab << "$k += s/using\\s*namespace\\s*cub/using namespace hipcub/g;" << endl << tab << return_k << "}" << endl; + } + void generateHostFunctions(unique_ptr& streamPtr) { *streamPtr.get() << endl << sub << "transformHostFunctions" << " {" << endl_tab << my_k << endl; set &funcSet = DeviceSymbolFunctions0; @@ -358,6 +363,7 @@ namespace perl { generateSimpleSubstitutions(streamPtr); generateExternShared(streamPtr); generateKernelLaunch(streamPtr); + generateCubNamespace(streamPtr); generateHostFunctions(streamPtr); generateDeviceFunctions(streamPtr); *streamPtr.get() << endl << "# Count of transforms in all files" << endl; @@ -401,6 +407,7 @@ namespace perl { *streamPtr.get() << tab_2 << "simpleSubstitutions();" << endl; *streamPtr.get() << tab_2 << "transformExternShared();" << endl; *streamPtr.get() << tab_2 << "transformKernelLaunch();" << endl; + *streamPtr.get() << tab_2 << "transformCubNamespace();" << endl; *streamPtr.get() << tab_2 << "if ($print_stats) {" << endl; *streamPtr.get() << tab_3 << while_ << "(/(\\b(hip|HIP)([A-Z]|_)\\w+\\b)/g) {" << endl; *streamPtr.get() << tab_4 << "$convertedTags{$1}++;" << endl_tab_3 << "}" << endl_tab_2 << "}" << endl; From e3cf10192c3bfdff55ae4d3076027b72c77da616 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Mon, 21 Oct 2019 17:50:00 +0300 Subject: [PATCH 33/45] [HIPIFY][tests] Set max clang's CudaArch for corresponding CUDA major.minor version [Reason] To support maximum CUDA features in offline tests + Add defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 restriction for atomicAdd on doubles in atomics.cu. So if LLVM < 7 and --cuda-gpu-arch doesn't work, __CUDA_ARCH__ is unset too (350 by default in clang); if LLVM >= 7 --cuda-gpu-arch is used and __CUDA_ARCH__ is set based on it. [ROCm/clr commit: 3233a845f63af019f83843bcce753e3325fd63b4] --- projects/clr/hipamd/tests/hipify-clang/lit.cfg | 16 +++++++++++----- .../hipify-clang/unit_tests/device/atomics.cu | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/tests/hipify-clang/lit.cfg b/projects/clr/hipamd/tests/hipify-clang/lit.cfg index 79be2b3d39..ea496e6a69 100644 --- a/projects/clr/hipamd/tests/hipify-clang/lit.cfg +++ b/projects/clr/hipamd/tests/hipify-clang/lit.cfg @@ -111,12 +111,18 @@ if config.pointer_size == 8: # Set max clang's CudaArch for corresponding CUDA version # to support maximum CUDA features in offline tests if config.cuda_version_major == 7: - clang_arguments += " --cuda-gpu-arch=sm_52" -if config.cuda_version_major == 8: + if config.cuda_version_minor == 5: + clang_arguments += " --cuda-gpu-arch=sm_53" + else: + clang_arguments += " --cuda-gpu-arch=sm_52" +elif config.cuda_version_major == 8: clang_arguments += " --cuda-gpu-arch=sm_62" -if config.cuda_version_major == 9: - clang_arguments += " --cuda-gpu-arch=sm_70" -if config.cuda_version_major == 10: +elif config.cuda_version_major == 9: + if config.cuda_version_minor == 2: + clang_arguments += " --cuda-gpu-arch=sm_72" + else: + clang_arguments += " --cuda-gpu-arch=sm_70" +elif config.cuda_version_major == 10: clang_arguments += " --cuda-gpu-arch=sm_75" # cuDNN ROOT diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu index e24b9a2175..3089efe1b8 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/device/atomics.cu @@ -276,7 +276,7 @@ int main(int argc, char** argv) { runTest(); runTest(); runTest(); -#if CUDA_VERSION >= 8000 +#if CUDA_VERSION >= 8000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600 runTest(); #endif // CHECK: hipDeviceReset(); From 82222bf9453443db7615f664a0dafc6676dd3329 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 22 Oct 2019 11:08:37 +0300 Subject: [PATCH 34/45] [HIPIFY][#1569] Fix [ROCm/clr commit: e2191e23e6ece223ec14345d98824feb60f14556] --- projects/clr/hipamd/bin/hipify-perl | 1 - .../clr/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp | 2 -- .../clr/hipamd/tests/hipify-clang/unit_tests/samples/vec_add.cu | 1 - 3 files changed, 4 deletions(-) diff --git a/projects/clr/hipamd/bin/hipify-perl b/projects/clr/hipamd/bin/hipify-perl index 09c8d2a139..80f721c58e 100755 --- a/projects/clr/hipamd/bin/hipify-perl +++ b/projects/clr/hipamd/bin/hipify-perl @@ -983,7 +983,6 @@ sub simpleSubstitutions { $ft{'type'} += s/\bcusparsePointerMode_t\b/hipsparsePointerMode_t/g; $ft{'type'} += s/\bcusparseSolvePolicy_t\b/hipsparseSolvePolicy_t/g; $ft{'type'} += s/\bcusparseStatus_t\b/hipsparseStatus_t/g; - $ft{'type'} += s/\bwarpSize\b/hipWarpSize/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_NON_UNIT\b/HIPBLAS_DIAG_NON_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_DIAG_UNIT\b/HIPBLAS_DIAG_UNIT/g; $ft{'numeric_literal'} += s/\bCUBLAS_FILL_MODE_FULL\b/HIPBLAS_FILL_MODE_FULL/g; diff --git a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp index fda9b34b2c..7b34d97ab4 100644 --- a/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp +++ b/projects/clr/hipamd/hipify-clang/src/CUDA2HIP_Runtime_API_types.cpp @@ -31,8 +31,6 @@ const std::map CUDA_RUNTIME_TYPE_NAME_MAP { {"cudaChannelFormatDesc", {"hipChannelFormatDesc", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaDeviceProp", {"hipDeviceProp_t", "", CONV_TYPE, API_RUNTIME}}, - // NOTE: int warpSize is a field of cudaDeviceProp - {"warpSize", {"hipWarpSize", "", CONV_TYPE, API_RUNTIME}}, // no analogue {"cudaEglFrame", {"hipEglFrame", "", CONV_TYPE, API_RUNTIME, HIP_UNSUPPORTED}}, diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/samples/vec_add.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/samples/vec_add.cu index bc8219bf8c..a6d8950e0b 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/samples/vec_add.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/samples/vec_add.cu @@ -67,7 +67,6 @@ int devcheck(int gpudevice, int rank) cudaError_t cudareturn; cudaDeviceProp deviceProp; cudaGetDeviceProperties(&deviceProp, gpudevice); - // CHECK: if (deviceProp.hipWarpSize <= 1) if (deviceProp.warpSize <= 1) { printf("rank %d: warning, CUDA Device Emulation (CPU) detected, exiting\n", rank); From 0896e419875ac63047c22293452c16ee4d929d03 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Tue, 22 Oct 2019 19:07:37 +0300 Subject: [PATCH 35/45] [HIPIFY] Disable delayed template parsing By implicit unconditional passing -fno-delayed-template-parsing option (which appeared in LLVM 3.8.0, thus doesn't need compatibility wrapping) to hipify-clang. [Reason] To parse uncalled template functions otherwise they are not parsed without calling, thus not hipified. Affects cub_03.cu test, which has uncalled global template function. [ROCm/clr commit: 7ab06b3892693ee5fc15ddf77b2d0fc8991077f9] --- projects/clr/hipamd/hipify-clang/src/main.cpp | 1 + .../hipify-clang/unit_tests/libraries/CUB/cub_03.cu | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/projects/clr/hipamd/hipify-clang/src/main.cpp b/projects/clr/hipamd/hipify-clang/src/main.cpp index 2214567df3..64037c43dd 100644 --- a/projects/clr/hipamd/hipify-clang/src/main.cpp +++ b/projects/clr/hipamd/hipify-clang/src/main.cpp @@ -199,6 +199,7 @@ int main(int argc, const char **argv) { Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("cuda", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-x", ct::ArgumentInsertPosition::BEGIN)); Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN)); + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-fno-delayed-template-parsing", ct::ArgumentInsertPosition::BEGIN)); if (!CudaPath.empty()) { std::string sCudaPath = "--cuda-path=" + CudaPath; Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster(sCudaPath.c_str(), ct::ArgumentInsertPosition::BEGIN)); diff --git a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu index 9fdbc17515..bc914d419d 100644 --- a/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu +++ b/projects/clr/hipamd/tests/hipify-clang/unit_tests/libraries/CUB/cub_03.cu @@ -1,8 +1,6 @@ // RUN: %run_test hipify "%s" "%t" %hipify_args %clang_args // CHECK: #include #include -// CHECK: #include -#include // CHECK: #include #include @@ -11,12 +9,14 @@ using namespace cub; // Simple CUDA kernel for computing tiled partial sums template + // CHECK: hipcub::BlockLoadAlgorithm LOAD_ALGO, + cub::BlockLoadAlgorithm LOAD_ALGO, + // CHECK: hipcub::BlockScanAlgorithm SCAN_ALGO> + cub::BlockScanAlgorithm SCAN_ALGO> __global__ void ScanTilesKernel(int *d_in, int *d_out) { // Specialize collective types for problem context - // TODO: typedef cub::BlockLoad BlockLoadT; - typedef BlockLoad BlockLoadT; + // CHECK: typedef ::hipcub::BlockLoad BlockLoadT; + typedef ::cub::BlockLoad BlockLoadT; typedef BlockScan BlockScanT; // Allocate on-chip temporary storage __shared__ union { From 50d72e13ca4a73564520bdaeb29be293f46114c9 Mon Sep 17 00:00:00 2001 From: Evgeny Mankov Date: Wed, 23 Oct 2019 18:54:45 +0300 Subject: [PATCH 36/45] [HIPIFY][cmake][#1571] Take into account building hipify-clang as a part of building HIP while installing [Algorithm] [Release] If CMAKE_INSTALL_PREFIX is set by the user: If BIN_INSTALL_DIR is set by HIP, use it as CMAKE_INSTALL_PREFIX, otherwise CMAKE_INSTALL_PREFIX is used unchanged. If the user does not set CMAKE_INSTALL_PREFIX (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT): If BIN_INSTALL_DIR is set by HIP, use it as CMAKE_INSTALL_PREFIX, otherwise use PROJECT_BINARY_DIR/bin for installation. [Debug] If CMAKE_INSTALL_PREFIX is set by the user: CMAKE_INSTALL_PREFIX is used unchanged. If the user does not set CMAKE_INSTALL_PREFIX (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT): use CMAKE_CURRENT_SOURCE_DIR/bin for installation. Standalone build left unchanged: CMAKE_INSTALL_PREFIX is used if set. [ROCm/clr commit: 2435567e70a4112d281a31ac1c33f8a8b20d63fd] --- .../clr/hipamd/hipify-clang/CMakeLists.txt | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/hipify-clang/CMakeLists.txt b/projects/clr/hipamd/hipify-clang/CMakeLists.txt index 25076a8895..2f24c6c6fe 100644 --- a/projects/clr/hipamd/hipify-clang/CMakeLists.txt +++ b/projects/clr/hipamd/hipify-clang/CMakeLists.txt @@ -80,11 +80,28 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${EXTRA_CFLAGS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${EXTRA_CFLAGS} ${StdCpp} -DHIPIFY_CLANG_RES=\\\"${LLVM_LIBRARY_DIRS}/clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}\\\"") +set(INSTALL_PATH_DOC_STRING "Installation path for hipify-clang") +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + if(CMAKE_BUILD_TYPE MATCHES Debug) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + elseif(CMAKE_BUILD_TYPE MATCHES Release) + if (BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + else() + set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/bin" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) + endif() + else() + message(FATAL_ERROR "Invalid CMAKE_BUILD_TYPE specified. Valid values are Debug and Release") + endif() +elseif(BIN_INSTALL_DIR) + set(CMAKE_INSTALL_PREFIX "${BIN_INSTALL_DIR}" CACHE PATH ${INSTALL_PATH_DOC_STRING} FORCE) +endif() + install(TARGETS hipify-clang DESTINATION ${CMAKE_INSTALL_PREFIX}) install( DIRECTORY ${LLVM_DIR}/../../clang/${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}.${LLVM_VERSION_PATCH}/ - DESTINATION ${CMAKE_INSTALL_PREFIX} + DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT clang-resource-headers FILES_MATCHING PATTERN "*.h" From 9e0eaef84659a105aef5c78d5c8dab5a90967990 Mon Sep 17 00:00:00 2001 From: Aryan Salmanpour Date: Thu, 24 Oct 2019 08:13:30 -0400 Subject: [PATCH 37/45] [hip] add support for implicit kernel argument for multi-grid sync (#1456) * [hip] add support for implicit kernel argument for multi-grid sync * modified code for calculating the prev_sum * change the impCoopArg type to size_t * add memory clean up * launch init_gws and main kernels into two separate loops [ROCm/clr commit: 93c688a0c946bf08c1629e9b77e2a406b56aabdc] --- projects/clr/hipamd/src/hip_hcc_internal.h | 21 +++ projects/clr/hipamd/src/hip_memory.cpp | 194 +++++++++++---------- projects/clr/hipamd/src/hip_module.cpp | 95 +++++++++- 3 files changed, 211 insertions(+), 99 deletions(-) diff --git a/projects/clr/hipamd/src/hip_hcc_internal.h b/projects/clr/hipamd/src/hip_hcc_internal.h index 3ee14577b0..658cfbf576 100644 --- a/projects/clr/hipamd/src/hip_hcc_internal.h +++ b/projects/clr/hipamd/src/hip_hcc_internal.h @@ -1022,6 +1022,27 @@ inline std::ostream& operator<<(std::ostream& os, const ihipCtx_t* c) { namespace hip_internal { hipError_t memcpyAsync(void* dst, const void* src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream); + +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags); + +hipError_t ihipHostFree(TlsData *tls, void* ptr); + +}; + +#define MAX_COOPERATIVE_GPUs 255 + +// do not change these two structs without changing the device library +struct mg_sync { + uint w0; + uint w1; +}; + +struct mg_info { + struct mg_sync *mgs; + uint grid_id; + uint num_grids; + ulong prev_sum; + ulong all_sum; }; //--- diff --git a/projects/clr/hipamd/src/hip_memory.cpp b/projects/clr/hipamd/src/hip_memory.cpp index 48b83287f3..c8369685ec 100644 --- a/projects/clr/hipamd/src/hip_memory.cpp +++ b/projects/clr/hipamd/src/hip_memory.cpp @@ -141,6 +141,103 @@ void* allocAndSharePtr(const char* msg, size_t sizeBytes, ihipCtx_t* ctx, bool s return ptr; } +hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { + hipError_t hip_status = hipSuccess; + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + + auto ctx = ihipGetTlsDefaultCtx(); + if ((ctx == nullptr) || (ptr == nullptr)) { + hip_status = hipErrorInvalidValue; + } + else if (sizeBytes == 0) { + hip_status = hipSuccess; + // TODO - should size of 0 return err or be siliently ignored? + } else { + unsigned trueFlags = flags; + if (flags == hipHostMallocDefault) { + // HCC/ROCM provide a modern system with unified memory and should set both of these + // flags by default: + trueFlags = hipHostMallocMapped | hipHostMallocPortable; + } + + + const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | + hipHostMallocWriteCombined | hipHostMallocCoherent | + hipHostMallocNonCoherent; + + + const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; + + if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { + *ptr = nullptr; + // can't specify unsupported flags, can't specify both Coherent + NonCoherent + hip_status = hipErrorInvalidValue; + } else { + auto device = ctx->getWriteableDevice(); +#if (__hcc_workweek__ >= 19115) + //Avoid mapping host pinned memory to all devices by HCC + unsigned amFlags = amHostUnmapped; +#else + unsigned amFlags = 0; +#endif + if (flags & hipHostMallocCoherent) { + amFlags |= amHostCoherent; + } else if (flags & hipHostMallocNonCoherent) { + amFlags |= amHostNonCoherent; + } else { + // depends on env variables: + amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; + } + + + *ptr = hip_internal::allocAndSharePtr( + (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, + true /*shareWithAll*/, amFlags, flags, 0); + + if (sizeBytes && (*ptr == NULL)) { + hip_status = hipErrorMemoryAllocation; + } + } + } + + if (HIP_SYNC_HOST_ALLOC) { + hipDeviceSynchronize(); + } + return hip_status; +} + +hipError_t ihipHostFree(TlsData *tls, void* ptr) { + + // Synchronize to ensure all work has finished. + ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits + // for all activity to finish. + + hipError_t hipStatus = hipErrorInvalidValue; + if (ptr) { + hc::accelerator acc; +#if (__hcc_workweek__ >= 17332) + hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); +#else + hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); +#endif + am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); + if (status == AM_SUCCESS) { + if (amPointerInfo._hostPointer == ptr) { + hc::am_free(ptr); + hipStatus = hipSuccess; + } + } + } else { + // free NULL pointer succeeds and is common technique to initialize runtime + hipStatus = hipSuccess; + } + + return hipStatus; +} + } // end namespace hip_internal @@ -301,79 +398,12 @@ hipError_t hipExtMallocWithFlags(void** ptr, size_t sizeBytes, unsigned int flag return ihipLogStatus(hip_status); } -hipError_t ihipHostMalloc(TlsData *tls, void** ptr, size_t sizeBytes, unsigned int flags) { - hipError_t hip_status = hipSuccess; - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - - auto ctx = ihipGetTlsDefaultCtx(); - if ((ctx == nullptr) || (ptr == nullptr)) { - hip_status = hipErrorInvalidValue; - } - else if (sizeBytes == 0) { - hip_status = hipSuccess; - // TODO - should size of 0 return err or be siliently ignored? - } else { - unsigned trueFlags = flags; - if (flags == hipHostMallocDefault) { - // HCC/ROCM provide a modern system with unified memory and should set both of these - // flags by default: - trueFlags = hipHostMallocMapped | hipHostMallocPortable; - } - - - const unsigned supportedFlags = hipHostMallocPortable | hipHostMallocMapped | - hipHostMallocWriteCombined | hipHostMallocCoherent | - hipHostMallocNonCoherent; - - - const unsigned coherencyFlags = hipHostMallocCoherent | hipHostMallocNonCoherent; - - if ((flags & ~supportedFlags) || ((flags & coherencyFlags) == coherencyFlags)) { - *ptr = nullptr; - // can't specify unsupported flags, can't specify both Coherent + NonCoherent - hip_status = hipErrorInvalidValue; - } else { - auto device = ctx->getWriteableDevice(); -#if (__hcc_workweek__ >= 19115) - //Avoid mapping host pinned memory to all devices by HCC - unsigned amFlags = amHostUnmapped; -#else - unsigned amFlags = 0; -#endif - if (flags & hipHostMallocCoherent) { - amFlags |= amHostCoherent; - } else if (flags & hipHostMallocNonCoherent) { - amFlags |= amHostNonCoherent; - } else { - // depends on env variables: - amFlags |= HIP_HOST_COHERENT ? amHostCoherent : amHostNonCoherent; - } - - - *ptr = hip_internal::allocAndSharePtr( - (amFlags & amHostCoherent) ? "finegrained_host" : "pinned_host", sizeBytes, ctx, - true /*shareWithAll*/, amFlags, flags, 0); - - if (sizeBytes && (*ptr == NULL)) { - hip_status = hipErrorMemoryAllocation; - } - } - } - - if (HIP_SYNC_HOST_ALLOC) { - hipDeviceSynchronize(); - } - return hip_status; -} hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) { HIP_INIT_SPECIAL_API(hipHostMalloc, (TRACE_MEM), ptr, sizeBytes, flags); HIP_SET_DEVICE(); hipError_t hip_status = hipSuccess; - hip_status = ihipHostMalloc(tls, ptr, sizeBytes, flags); + hip_status = hip_internal::ihipHostMalloc(tls, ptr, sizeBytes, flags); return ihipLogStatus(hip_status); } @@ -384,7 +414,7 @@ hipError_t hipMallocManaged(void** devPtr, size_t size, unsigned int flags) { if(flags != hipMemAttachGlobal) hip_status = hipErrorInvalidValue; else - hip_status = ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); + hip_status = hip_internal::ihipHostMalloc(tls, devPtr, size, hipHostMallocDefault); return ihipLogStatus(hip_status); } @@ -2146,30 +2176,8 @@ hipError_t hipFree(void* ptr) { hipError_t hipHostFree(void* ptr) { HIP_INIT_SPECIAL_API(hipHostFree, (TRACE_MEM), ptr); - // Synchronize to ensure all work has finished. - ihipGetTlsDefaultCtx()->locked_waitAllStreams(); // ignores non-blocking streams, this waits - // for all activity to finish. - - - hipError_t hipStatus = hipErrorInvalidValue; - if (ptr) { - hc::accelerator acc; -#if (__hcc_workweek__ >= 17332) - hc::AmPointerInfo amPointerInfo(NULL, NULL, NULL, 0, acc, 0, 0); -#else - hc::AmPointerInfo amPointerInfo(NULL, NULL, 0, acc, 0, 0); -#endif - am_status_t status = hc::am_memtracker_getinfo(&amPointerInfo, ptr); - if (status == AM_SUCCESS) { - if (amPointerInfo._hostPointer == ptr) { - hc::am_free(ptr); - hipStatus = hipSuccess; - } - } - } else { - // free NULL pointer succeeds and is common technique to initialize runtime - hipStatus = hipSuccess; - } + hipError_t hipStatus = hipSuccess; + hipStatus = hip_internal::ihipHostFree(tls, ptr); return ihipLogStatus(hipStatus); }; diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index b3afdd4ffe..ac239105b8 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -137,7 +137,8 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ, size_t sharedMemBytes, hipStream_t hStream, void** kernelParams, void** extra, - hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0) { + hipEvent_t startEvent, hipEvent_t stopEvent, uint32_t flags, bool isStreamLocked = 0, + void** impCoopParams = 0) { using namespace hip_impl; auto ctx = ihipGetTlsDefaultCtx(); @@ -181,10 +182,17 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global return hipErrorInvalidValue; } - // Insert 48-bytes at the end for implicit kernel arguments and fill with value zero. + // Insert 56-bytes at the end for implicit kernel arguments and fill with value zero. size_t padSize = (~kernargs.size() + 1) & (HIP_IMPLICIT_KERNARG_ALIGNMENT - 1); kernargs.insert(kernargs.end(), padSize + HIP_IMPLICIT_KERNARG_SIZE, 0); + if (impCoopParams) { + const auto p{static_cast(*impCoopParams)}; + // The sixth index is for multi-grid synchronization + kernargs.insert((kernargs.cend() - padSize - HIP_IMPLICIT_KERNARG_SIZE) + 6 * HIP_IMPLICIT_KERNARG_ALIGNMENT, + p, p + HIP_IMPLICIT_KERNARG_ALIGNMENT); + } + /* Kernel argument preparation. */ @@ -449,6 +457,10 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, return ihipLogStatus(hipErrorLaunchFailure); } + size_t impCoopArg = 1; + void* impCoopParams[1]; + impCoopParams[0] = &impCoopArg; + // launch the main kernel result = ihipModuleLaunchKernel(tls, kd, gridDim.x * blockDimX.x, @@ -456,7 +468,7 @@ hipError_t hipLaunchCooperativeKernel(const void* f, dim3 gridDim, gridDim.z * blockDimX.z, blockDimX.x, blockDimX.y, blockDimX.z, sharedMemBytes, stream, kernelParams, nullptr, nullptr, - nullptr, 0, true); + nullptr, 0, true, impCoopParams); stream->criticalData().unlock(); #if (__hcc_workweek__ >= 19213) @@ -472,7 +484,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi HIP_INIT_API(hipLaunchCooperativeKernelMultiDevice, launchParamsList, numDevices, flags); hipError_t result; - if (numDevices > g_deviceCnt || launchParamsList == nullptr) { + if (numDevices > g_deviceCnt || launchParamsList == nullptr || numDevices > MAX_COOPERATIVE_GPUs) { return ihipLogStatus(hipErrorInvalidValue); } @@ -523,6 +535,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi kargs.getHandle()); } + mg_sync *mg_sync_ptr = 0; + mg_info *mg_info_ptr[MAX_COOPERATIVE_GPUs] = {0}; + + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_sync_ptr, sizeof(mg_sync), hipHostMallocDefault); + if (result != hipSuccess) { + return ihipLogStatus(hipErrorInvalidValue); + } + mg_sync_ptr->w0 = 0; + mg_sync_ptr->w1 = 0; + + uint all_sum = 0; + for (int i = 0; i < numDevices; ++i) { + result = hip_internal::ihipHostMalloc(tls, (void **)&mg_info_ptr[i], sizeof(mg_info), hipHostMallocDefault); + if (result != hipSuccess) { + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < i; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorInvalidValue); + } + // calculate the sum of sizes of all grids + const hipLaunchParams& lp = launchParamsList[i]; + all_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + } + // lock all streams before launching the blit kernels for initializing the GWS and main kernels to each device for (int i = 0; i < numDevices; ++i) { LockedAccessor_StreamCrit_t streamCrit(launchParamsList[i].stream->criticalData(), false); @@ -531,7 +569,7 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi #endif } - // launch the init_gws kernel to initialize the GWS followed by launching the main kernels for each device + // launch the init_gws kernel to initialize the GWS for each device for (int i = 0; i < numDevices; ++i) { const hipLaunchParams& lp = launchParamsList[i]; @@ -549,8 +587,32 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); #endif } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(hipErrorLaunchFailure); } + } + + void* impCoopParams[1]; + ulong prev_sum = 0; + // launch the main kernels for each device + for (int i = 0; i < numDevices; ++i) { + const hipLaunchParams& lp = launchParamsList[i]; + + //initialize and setup the implicit kernel argument for multi-grid sync + mg_info_ptr[i]->mgs = mg_sync_ptr; + mg_info_ptr[i]->grid_id = i; + mg_info_ptr[i]->num_grids = numDevices; + mg_info_ptr[i]->all_sum = all_sum; + mg_info_ptr[i]->prev_sum = prev_sum; + prev_sum += lp.blockDim.x * lp.blockDim.y * lp.blockDim.z * + lp.gridDim.x * lp.gridDim.y * lp.gridDim.z; + + + impCoopParams[0] = &mg_info_ptr[i]; result = ihipModuleLaunchKernel(tls, kds[i], lp.gridDim.x * lp.blockDim.x, @@ -559,7 +621,23 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi lp.blockDim.x, lp.blockDim.y, lp.blockDim.z, lp.sharedMem, lp.stream, lp.args, nullptr, nullptr, nullptr, 0, - true); + true, impCoopParams); + + if (result != hipSuccess) { + for (int j = 0; j < numDevices; ++j) { + launchParamsList[j].stream->criticalData().unlock(); +#if (__hcc_workweek__ >= 19213) + launchParamsList[j].stream->criticalData()._av.release_locked_hsa_queue(); +#endif + } + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + + return ihipLogStatus(hipErrorLaunchFailure); + } + } // unlock all streams @@ -573,6 +651,11 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi free(gwsKds); free(kds); + hip_internal::ihipHostFree(tls, mg_sync_ptr); + for (int j = 0; j < numDevices; ++j) { + hip_internal::ihipHostFree(tls, mg_info_ptr[j]); + } + return ihipLogStatus(result); } From fb411b56c294bb42b9fc63e0bce014d2a9095ff3 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 24 Oct 2019 13:13:49 +0100 Subject: [PATCH 38/45] Improve scalar access into vector types. (#1531) The improvement is based on the ideas here: https://t0rakka.silvrback.com/simd-scalar-accessor. It yields significantly better ISA when the base's .xyzw members are used. [ROCm/clr commit: 84d5b399f642263f9ba99e5df97e0addc86a3d7c] --- .../include/hip/hcc_detail/hip_vector_types.h | 109 +++++++++++++++--- 1 file changed, 96 insertions(+), 13 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h index a03a46b8cf..f80745038a 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_vector_types.h @@ -47,6 +47,95 @@ THE SOFTWARE. #if defined(__cplusplus) #include + namespace hip_impl { + template + struct Scalar_accessor { + // Idea from https://t0rakka.silvrback.com/simd-scalar-accessor + Vector data; + + __host__ __device__ + operator T() const noexcept { return data[idx]; } + + __host__ __device__ + Scalar_accessor& operator=(T x) noexcept { + data[idx] = x; + + return *this; + } + + __host__ __device__ + Scalar_accessor& operator+=(T x) noexcept { + data[idx] += x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator-=(T x) noexcept { + data[idx] -= x; + return *this; + } + + __host__ __device__ + Scalar_accessor& operator*=(T x) noexcept { + data[idx] *= x; + return *this; + } + __host__ __device__ + Scalar_accessor& operator/=(T x) noexcept { + data[idx] /= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator%=(T x) noexcept { + data[idx] %= x; + return *this; + } + + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator>>=(T x) noexcept { + data[idx] >>= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator<<=(T x) noexcept { + data[idx] <<= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator&=(T x) noexcept { + data[idx] &= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator|=(T x) noexcept { + data[idx] |= x; + return *this; + } + template< + typename U = T, + typename std::enable_if{}>::type* = nullptr> + __host__ __device__ + Scalar_accessor& operator^=(T x) noexcept { + data[idx] ^= x; + return *this; + } + }; + } // Namespace hip_impl. + template struct HIP_vector_base; template @@ -55,9 +144,7 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - }; + hip_impl::Scalar_accessor x; }; }; @@ -67,10 +154,8 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; }; }; @@ -238,12 +323,10 @@ THE SOFTWARE. union { Native_vec_ data; - struct { - T x; - T y; - T z; - T w; - }; + hip_impl::Scalar_accessor x; + hip_impl::Scalar_accessor y; + hip_impl::Scalar_accessor z; + hip_impl::Scalar_accessor w; }; }; From 510be4b5dc21d1519f468760ab4c010e06d359db Mon Sep 17 00:00:00 2001 From: searlmc1 Date: Thu, 24 Oct 2019 05:14:05 -0700 Subject: [PATCH 39/45] Improve performance of v2 arg handling (#1539) * Improve performance of v2 arg handling * Missing change to `std::string` [ROCm/clr commit: 15a699688e2dba2b0351720c3e81ae8b669cd001] --- .../hip/hcc_detail/code_object_bundle.hpp | 2 +- projects/clr/hipamd/src/hip_module.cpp | 33 ++- projects/clr/hipamd/src/program_state.inl | 192 +++++++++++++----- 3 files changed, 150 insertions(+), 77 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/code_object_bundle.hpp b/projects/clr/hipamd/include/hip/hcc_detail/code_object_bundle.hpp index 32b0c0dbc8..f312d2e79b 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/code_object_bundle.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/code_object_bundle.hpp @@ -86,7 +86,7 @@ struct Bundled_code { char cbuf[sizeof(offset) + sizeof(bundle_sz) + sizeof(triple_sz)]; } header; std::string triple; - std::vector blob; + std::string blob; }; #define magic_string_ "__CLANG_OFFLOAD_BUNDLE__" diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index ac239105b8..2afbabf0a8 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -109,6 +109,7 @@ struct ihipModuleSymbol_t { amd_kernel_code_t const* _header{}; string _name; // TODO - review for performance cost. Name is just used for debug. vector> _kernarg_layout{}; + bool _is_code_object_v3{}; }; template <> @@ -216,8 +217,7 @@ hipError_t ihipModuleLaunchKernel(TlsData *tls, hipFunction_t f, uint32_t global aql.grid_size_x = globalWorkSizeX; aql.grid_size_y = globalWorkSizeY; aql.grid_size_z = globalWorkSizeZ; - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto* header = reinterpret_cast(f->_header); aql.group_segment_size = @@ -1060,31 +1060,24 @@ hipFuncAttributes make_function_attributes(TlsData *tls, const ihipModuleSymbol_ // available per CU, therefore we hardcode it to 64 KiRegisters. prop.regsPerBlock = prop.regsPerBlock ? prop.regsPerBlock : 64 * 1024; - bool is_code_object_v3 = kd._name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (kd._is_code_object_v3) { r.localSizeBytes = header_v3(kd)->private_segment_fixed_size; r.sharedSizeBytes = header_v3(kd)->group_segment_fixed_size; - } else { - r.localSizeBytes = kd._header->workitem_private_segment_byte_size; - r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; - } - r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; - if (is_code_object_v3) { r.numRegs = ((header_v3(kd)->compute_pgm_rsrc1 & 0x3F) + 1) << 2; - } else { - r.numRegs = kd._header->workitem_vgpr_count; - } - r.maxThreadsPerBlock = r.numRegs ? - std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : - prop.maxThreadsPerBlock; - if (is_code_object_v3) { r.binaryVersion = 0; // FIXME: should it be the ISA version or code // object format version? } else { + r.localSizeBytes = kd._header->workitem_private_segment_byte_size; + r.sharedSizeBytes = kd._header->workgroup_group_segment_byte_size; + r.numRegs = kd._header->workitem_vgpr_count; r.binaryVersion = kd._header->amd_machine_version_major * 10 + kd._header->amd_machine_version_minor; } + r.maxDynamicSharedSizeBytes = prop.sharedMemPerBlock - r.sharedSizeBytes; + r.maxThreadsPerBlock = r.numRegs ? + std::min(prop.maxThreadsPerBlock, prop.regsPerBlock / r.numRegs) : + prop.maxThreadsPerBlock; r.ptxVersion = prop.major * 10 + prop.minor; // HIP currently presents itself as PTX 3.0. return r; @@ -1182,8 +1175,7 @@ hipError_t ihipModuleLoadData(TlsData *tls, hipModule_t* module, const void* ima content.data(), content.size(), (*module)->executable, this_agent()); - std::vector blob(content.cbegin(), content.cend()); - program_state_impl::read_kernarg_metadata(blob, (*module)->kernargs); + program_state_impl::read_kernarg_metadata(content, (*module)->kernargs); // compute the hash of the code object (*module)->hash = checksum(content.length(), content.data()); @@ -1235,8 +1227,7 @@ hipError_t hipModuleGetTexRef(textureReference** texRef, hipModule_t hmod, const void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size_t* usedLDS) { - bool is_code_object_v3 = f->_name.find(".kd") != std::string::npos; - if (is_code_object_v3) { + if (f->_is_code_object_v3) { const auto header = reinterpret_cast(f->_header); // GRANULATED_WAVEFRONT_VGPR_COUNT is specified in 0:5 bits of COMPUTE_PGM_RSRC1 // the granularity for gfx6-gfx9 is max(0, ceil(vgprs_used / 4) - 1) diff --git a/projects/clr/hipamd/src/program_state.inl b/projects/clr/hipamd/src/program_state.inl index 4f05d2763e..9feabbc2f7 100644 --- a/projects/clr/hipamd/src/program_state.inl +++ b/projects/clr/hipamd/src/program_state.inl @@ -89,9 +89,10 @@ struct Symbol { class Kernel_descriptor { std::uint64_t kernel_object_{}; - amd_kernel_code_t const* kernel_header_{nullptr}; - std::string name_{}; + amd_kernel_code_t const* header_{}; + std::string name_; std::vector> kernarg_layout_{}; + bool is_code_object_v3_{}; public: Kernel_descriptor() = default; Kernel_descriptor( @@ -101,7 +102,8 @@ public: : kernel_object_{kernel_object}, name_{name}, - kernarg_layout_{std::move(kernarg_layout)} + kernarg_layout_{std::move(kernarg_layout)}, + is_code_object_v3_{name.find(".kd") != std::string::npos} { bool supported{false}; std::uint16_t min_v{UINT16_MAX}; @@ -123,7 +125,7 @@ public: r = tbl.hsa_ven_amd_loader_query_host_address( reinterpret_cast(kernel_object_), - reinterpret_cast(&kernel_header_)); + reinterpret_cast(&header_)); if (r != HSA_STATUS_SUCCESS) return; } @@ -149,7 +151,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>> code_object_blobs; + std::vector>>> code_object_blobs; std::pair< std::once_flag, @@ -213,7 +215,7 @@ public: std::string, std::unordered_map< hsa_isa_t, - std::vector>>>& get_code_object_blobs() { + std::vector>>& get_code_object_blobs() { std::call_once(code_object_blobs.first, [this]() { dl_iterate_phdr([](dl_phdr_info* info, std::size_t, void* p) { @@ -584,6 +586,68 @@ public: return functions[agent].second; } + static + std::size_t parse_args_v2( + const std::string& metadata, + std::size_t f, + std::size_t l, + std::vector>& size_align) { + if (f == l) return f; + if (!size_align.empty()) return l; + + do { + static constexpr size_t size_sz{5}; + f = metadata.find("Size:", f) + size_sz; + + if (l <= f) return f; + + auto size = std::strtoul(&metadata[f], nullptr, 10); + + static constexpr size_t align_sz{6}; + f = metadata.find("Align:", f) + align_sz; + + char* l{}; + auto align = std::strtoul(&metadata[f], &l, 10); + + f += (l - &metadata[f]) + 1; + + size_align.emplace_back(size, align); + } while (true); + } + + static + void read_kernarg_metadata_v2( + const std::string& kernels_md, + std::size_t dx, + std::unordered_map< + std::string, + std::vector>>& kernargs) { + do { + dx = kernels_md.find("Name:", dx); + + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) name_sz{5}; + dx = kernels_md.find_first_not_of(" '", dx + name_sz); + + auto fn = + kernels_md.substr(dx, kernels_md.find_first_of("'\n", dx) - dx); + dx += fn.size(); + + auto dx1 = kernels_md.find("CodeProps", dx); + dx = kernels_md.find("Args:", dx); + + if (dx1 < dx) { + dx = dx1; + continue; + } + if (dx == std::string::npos) break; + + static constexpr decltype(kernels_md.size()) args_sz{5}; + dx = parse_args_v2(kernels_md, dx + args_sz, dx1, kernargs[fn]); + } while (true); + } + static std::string metadata_to_string(const amd_comgr_metadata_node_t& md) { std::string str; @@ -598,9 +662,8 @@ public: } static - void parse_args( + void parse_args_v3( const amd_comgr_metadata_node_t& args_md, - bool is_code_object_v3, std::vector>& size_align) { size_t arg_count = 0; if (amd_comgr_get_metadata_list_size(args_md, &arg_count) @@ -615,9 +678,7 @@ public: return; amd_comgr_metadata_node_t arg_size_md; - if (amd_comgr_metadata_lookup(arg_md, - is_code_object_v3 ? ".size" : "Size", - &arg_size_md) + if (amd_comgr_metadata_lookup(arg_md, ".size", &arg_size_md) != AMD_COMGR_STATUS_SUCCESS) return; @@ -629,35 +690,21 @@ public: size_t arg_align; - if (is_code_object_v3) { - amd_comgr_metadata_node_t arg_offset_md; - if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + amd_comgr_metadata_node_t arg_offset_md; + if (amd_comgr_metadata_lookup(arg_md, ".offset", &arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - size_t arg_offset - = std::stoul(metadata_to_string(arg_offset_md)); + size_t arg_offset = std::stoul(metadata_to_string(arg_offset_md)); - if (amd_comgr_destroy_metadata(arg_offset_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + if (amd_comgr_destroy_metadata(arg_offset_md) + != AMD_COMGR_STATUS_SUCCESS) + return; - arg_align = 1; - while (arg_offset && (arg_offset & 1) == 0) { - arg_offset >>= 1; - arg_align <<= 1; - } - } else { - amd_comgr_metadata_node_t arg_align_md; - if (amd_comgr_metadata_lookup(arg_md, "Align", &arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; - - arg_align = std::stoul(metadata_to_string(arg_align_md)); - - if (amd_comgr_destroy_metadata(arg_align_md) - != AMD_COMGR_STATUS_SUCCESS) - return; + arg_align = 1; + while (arg_offset && (arg_offset & 1) == 0) { + arg_offset >>= 1; + arg_align <<= 1; } size_align.emplace_back(arg_size, arg_align); @@ -669,11 +716,11 @@ public: } static - void read_kernarg_metadata( - const std::vector& blob, + void read_kernarg_metadata_v3( + const std::string& blob, std::unordered_map< - std::string, - std::vector>>& kernargs) { + std::string, + std::vector>>& kernargs) { amd_comgr_data_t dataIn; amd_comgr_status_t status; @@ -690,7 +737,6 @@ public: != AMD_COMGR_STATUS_SUCCESS) return; - bool is_code_object_v3 = false; amd_comgr_metadata_node_t kernels_md; if (amd_comgr_metadata_lookup(metadata, "Kernels", &kernels_md) != AMD_COMGR_STATUS_SUCCESS) { @@ -699,7 +745,6 @@ public: &kernels_md) != AMD_COMGR_STATUS_SUCCESS) return; - is_code_object_v3 = true; } size_t kernel_count = 0; @@ -715,9 +760,7 @@ public: continue; amd_comgr_metadata_node_t name_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".name" : "Name", - &name_md) + if (amd_comgr_metadata_lookup(kernel_md, ".name", &name_md) != AMD_COMGR_STATUS_SUCCESS) continue; @@ -727,21 +770,15 @@ public: != AMD_COMGR_STATUS_SUCCESS) continue; - if (is_code_object_v3) - kernel_name_str.append(".kd"); - - amd_comgr_metadata_node_t args_md; - if (amd_comgr_metadata_lookup(kernel_md, - is_code_object_v3 ? ".args" : "Args", - &args_md) + if (amd_comgr_metadata_lookup(kernel_md, ".args", &args_md) != AMD_COMGR_STATUS_SUCCESS) continue; auto foundKernel = kernargs.find(kernel_name_str); // parse arguments for a given kernel only once if (foundKernel == kernargs.end()) { - parse_args(args_md, is_code_object_v3, kernargs[kernel_name_str]); + parse_args_v3(args_md, kernargs[kernel_name_str]); } if (amd_comgr_destroy_metadata(args_md) != AMD_COMGR_STATUS_SUCCESS @@ -757,7 +794,52 @@ public: amd_comgr_release_data(dataIn); } - const std::unordered_map>>& kernargs) + { + std::istringstream istr{blob}; + ELFIO::elfio reader; + + if (!reader.load(istr)) return; + + // TODO: this is inefficient. + auto it = find_section_if(reader, [](const ELFIO::section* x) { + return x->get_type() == SHT_NOTE; + }); + + if (!it) return; + + const ELFIO::note_section_accessor acc{reader, it}; + auto n{acc.get_notes_num()}; + while (n--) { + ELFIO::Elf_Word type{}; + std::string name{}; + void* desc{}; + ELFIO::Elf_Word desc_size{}; + + acc.get_note(n, type, name, desc, desc_size); + + if (name == "AMDGPU") { + return read_kernarg_metadata_v3(blob, kernargs); + } + if (name != "AMD") continue; // TODO: switch to using NT_AMD_AMDGPU_HSA_METADATA. + + std::string tmp{ + static_cast(desc), static_cast(desc) + desc_size}; + + auto dx = tmp.find("Kernels:"); + + if (dx == std::string::npos) continue; + + return read_kernarg_metadata_v2(tmp, dx + 8u, kernargs); // Skip "Kernels:". + } + } + + const std::unordered_map>>& get_kernargs() { std::call_once(kernargs.first, [this]() { From ad1e409a248a07e40b6afdfe1c6c9742ec14d0b4 Mon Sep 17 00:00:00 2001 From: satyanveshd <53337087+satyanveshd@users.noreply.github.com> Date: Thu, 24 Oct 2019 17:44:47 +0530 Subject: [PATCH 40/45] Fix occupany APIs (#1560) Addresses SWDEV-205006 [ROCm/clr commit: 6c5fbf9b4a52383179d44b826e72e65f466ac824] --- .../hip/hcc_detail/functional_grid_launch.hpp | 29 ------------ .../include/hip/hcc_detail/hip_runtime_api.h | 36 +++++++++++---- .../2_Cookbook/13_occupancy/occupancy.cpp | 8 ++-- projects/clr/hipamd/src/hip_module.cpp | 45 ++++++++++--------- .../module/hipLaunchCooperativeKernel.cpp | 2 +- ...upancyMaxActiveBlocksPerMultiprocessor.cpp | 21 ++------- .../hipOccupancyMaxPotentialBlockSize.cpp | 20 +-------- 7 files changed, 63 insertions(+), 98 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp b/projects/clr/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp index c493eec933..da57240ebd 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp +++ b/projects/clr/hipamd/include/hip/hcc_detail/functional_grid_launch.hpp @@ -127,35 +127,6 @@ void hipLaunchKernelGGLImpl( } // Namespace hip_impl. -template -inline -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - F kernel, size_t dynSharedMemPerBlk, uint32_t blockSizeLimit) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxPotentialBlockSize(gridSize, blockSize, f, - dynSharedMemPerBlk, blockSizeLimit); -} - -template -inline -hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor(uint32_t* numBlocks, F kernel, - uint32_t blockSize, size_t dynSharedMemPerBlk) { - - using namespace hip_impl; - - hip_impl::hip_init(); - auto f = get_program_state().kernel_descriptor(reinterpret_cast(kernel), - target_agent(0)); - - return hipOccupancyMaxActiveBlocksPerMultiprocessor(numBlocks, f, blockSize, dynSharedMemPerBlk); -} - template inline void hipLaunchKernelGGL(F kernel, const dim3& numBlocks, const dim3& dimBlocks, diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h index b4402fd67a..3a81305ba3 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_api.h @@ -2884,14 +2884,14 @@ hipError_t hipLaunchCooperativeKernelMultiDevice(hipLaunchParams* launchParamsLi * @param [out] gridSize minimum grid size for maximum potential occupancy * @param [out] blockSize block size for maximum potential occupancy * @param [in] f kernel function for which occupancy is calulated - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] blockSizeLimit the maximum block size for the kernel, use 0 for no limit * * @returns hipSuccess, hipInvalidDevice, hipErrorInvalidValue */ -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit); +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit); /** * @brief Returns occupancy for a device function. @@ -2899,10 +2899,10 @@ hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* block * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize); /** * @brief Returns occupancy for a device function. @@ -2910,11 +2910,11 @@ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( * @param [out] numBlocks Returned occupancy * @param [in] func Kernel function for which occupancy is calulated * @param [in] blockSize Block size the kernel is intended to be launched with - * @param [in] dynSharedMemPerBlk dynamic shared memory usage (in bytes) intended for each block + * @param [in] dynamicSMemSize Per - block dynamic shared memory usage intended, in bytes * @param [in] flags Extra flags for occupancy calculation (currently ignored) */ hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, unsigned int flags); + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags); /** * @brief Launches kernels on multiple devices and guarantees all specified kernels are dispatched @@ -3320,7 +3320,27 @@ hipError_t hipBindTextureToMipmappedArray(const texture& tex, return hipSuccess; } +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize) { + return hipOccupancyMaxActiveBlocksPerMultiprocessor( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize); +} +template +inline hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + int* numBlocks, T f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { + return hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( + numBlocks, reinterpret_cast(f), blockSize, dynamicSMemSize, flags); +} + +template +inline hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + T f, size_t dynamicSMemSize, int blockSizeLimit) { + return hipOccupancyMaxPotentialBlockSize( + gridSize, blockSize, reinterpret_cast(f), dynamicSMemSize, blockSizeLimit); +} + template inline hipError_t hipLaunchCooperativeKernel(T f, dim3 gridDim, dim3 blockDim, void** kernelParams, unsigned int sharedMemBytes, hipStream_t stream) { diff --git a/projects/clr/hipamd/samples/2_Cookbook/13_occupancy/occupancy.cpp b/projects/clr/hipamd/samples/2_Cookbook/13_occupancy/occupancy.cpp index a9f4e198b0..01fa7aafed 100644 --- a/projects/clr/hipamd/samples/2_Cookbook/13_occupancy/occupancy.cpp +++ b/projects/clr/hipamd/samples/2_Cookbook/13_occupancy/occupancy.cpp @@ -56,9 +56,9 @@ void launchKernel(float* C, float* A, float* B, bool manual){ const unsigned threadsperblock = 32; const unsigned blocks = (NUM/threadsperblock)+1; - uint32_t mingridSize = 0; - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int mingridSize = 0; + int gridSize = 0; + int blockSize = 0; if (manual){ blockSize = threadsperblock; @@ -86,7 +86,7 @@ void launchKernel(float* C, float* A, float* B, bool manual){ printf("kernel Execution time = %6.3fms\n", eventMs); //Calculate Occupancy - uint32_t numBlock = 0; + int numBlock = 0; HIP_CHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, multiply, blockSize, 0)); if(devProp.maxThreadsPerMultiProcessor){ diff --git a/projects/clr/hipamd/src/hip_module.cpp b/projects/clr/hipamd/src/hip_module.cpp index 2afbabf0a8..c15cc34cd5 100644 --- a/projects/clr/hipamd/src/hip_module.cpp +++ b/projects/clr/hipamd/src/hip_module.cpp @@ -1248,9 +1248,9 @@ void getGprsLdsUsage(hipFunction_t f, size_t* usedVGPRS, size_t* usedSGPRS, size } } -hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, int* gridSize, int* blockSize, + hipFunction_t f, size_t dynamicSMemSize, + int blockSizeLimit) { using namespace hip_impl; @@ -1331,7 +1331,7 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, } else { size_t availableSharedMemPerCU = prop.maxSharedMemoryPerMultiProcessor; - size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynSharedMemPerBlk); + size_t workgroupPerCU = availableSharedMemPerCU / (usedLDS + dynamicSMemSize); wavefrontsLDS = min(workgroupPerCU, maxWorkgroupPerCU) * wavefrontsPerWG; } @@ -1360,18 +1360,19 @@ hipError_t ihipOccupancyMaxPotentialBlockSize(TlsData *tls, uint32_t* gridSize, return hipSuccess; } -hipError_t hipOccupancyMaxPotentialBlockSize(uint32_t* gridSize, uint32_t* blockSize, - hipFunction_t f, size_t dynSharedMemPerBlk, - uint32_t blockSizeLimit) +hipError_t hipOccupancyMaxPotentialBlockSize(int* gridSize, int* blockSize, + const void* f, size_t dynamicSMemSize, + int blockSizeLimit) { - HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit); - + HIP_INIT_API(hipOccupancyMaxPotentialBlockSize, gridSize, blockSize, f, dynamicSMemSize, blockSizeLimit); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxPotentialBlockSize(tls, - gridSize, blockSize, f, dynSharedMemPerBlk, blockSizeLimit)); + gridSize, blockSize, F, dynamicSMemSize, blockSizeLimit)); } hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( - TlsData *tls, uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + TlsData *tls, int* numBlocks, hipFunction_t f, int blockSize, size_t dynamicSMemSize) { using namespace hip_impl; @@ -1411,35 +1412,39 @@ hipError_t ihipOccupancyMaxActiveBlocksPerMultiprocessor( : std::min(maxWavesPerSimd, availableSGPRs / usedSGPRS)); // Calculate blocks occupancy per CU based on SGPR usage - *numBlocks = std::min(*numBlocks, (uint32_t) (sgprs_alu_occupancy / numWavefronts)); + *numBlocks = std::min(*numBlocks, (int) (sgprs_alu_occupancy / numWavefronts)); - size_t total_used_lds = usedLDS + dynSharedMemPerBlk; + size_t total_used_lds = usedLDS + dynamicSMemSize; if (total_used_lds != 0) { // Calculate LDS occupacy per CU. lds_per_cu / (static_lsd + dynamic_lds) size_t lds_occupancy = prop.maxSharedMemoryPerMultiProcessor / total_used_lds; - *numBlocks = std::min(*numBlocks, (uint32_t) lds_occupancy); + *numBlocks = std::min(*numBlocks, (int) lds_occupancy); } return hipSuccess; } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessor( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk) + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynSharedMemPerBlk); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessor, numBlocks, f, blockSize, dynamicSMemSize); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags( - uint32_t* numBlocks, hipFunction_t f, uint32_t blockSize, size_t dynSharedMemPerBlk, + int* numBlocks, const void* f, int blockSize, size_t dynamicSMemSize, unsigned int flags) { - HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynSharedMemPerBlk, flags); + HIP_INIT_API(hipOccupancyMaxActiveBlocksPerMultiprocessorWithFlags, numBlocks, f, blockSize, dynamicSMemSize, flags); + auto F = hip_impl::get_program_state().kernel_descriptor((std::uintptr_t)(f), + hip_impl::target_agent(0)); return ihipLogStatus(ihipOccupancyMaxActiveBlocksPerMultiprocessor( - tls, numBlocks, f, blockSize, dynSharedMemPerBlk)); + tls, numBlocks, F, blockSize, dynamicSMemSize)); } hipError_t hipLaunchKernel( diff --git a/projects/clr/hipamd/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp b/projects/clr/hipamd/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp index 89d003ea94..8089f26f16 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/module/hipLaunchCooperativeKernel.cpp @@ -116,7 +116,7 @@ int main() { dimBlock.x = workgroups[i]; // Calculate the device occupancy to know how many blocks can be run concurrently - hipOccupancyMaxActiveBlocksPerMultiprocessor(reinterpret_cast(&numBlocks), + hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlocks, test_gws, dimBlock.x * dimBlock.y * dimBlock.z, dimBlock.x * sizeof(long)); dimGrid.x = deviceProp.multiProcessorCount * std::min(numBlocks, 32); diff --git a/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp b/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp index ebf656b72f..2838c09cd1 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxActiveBlocksPerMultiprocessor.cpp @@ -30,10 +30,6 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template @@ -44,16 +40,15 @@ __global__ void f2(T *a) { *a = 1; } int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); - uint32_t numBlock = 0; + int numBlock = 0; hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f1, blockSize, 0); assert(numBlock != 0); - // test case for using kernel function pointer with template gridSize = 0; blockSize = 0; @@ -64,15 +59,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, f2, blockSize, 0); assert(numBlock != 0); - - // test case for using kernel with hipFunction_t type - numBlock = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxActiveBlocksPerMultiprocessor(&numBlock, Function, blockSize, 0)); - assert(numBlock != 0); - passed(); } diff --git a/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp b/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp index a81862952d..22a3f05283 100644 --- a/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp +++ b/projects/clr/hipamd/tests/src/runtimeApi/module/hipOccupancyMaxPotentialBlockSize.cpp @@ -30,22 +30,16 @@ THE SOFTWARE. #include "hip/hip_runtime.h" #include "test_common.h" -#define fileName "vcpy_kernel.code" -#define kernel_name "hello_world" - - __global__ void f1(float *a) { *a = 1.0; } template __global__ void f2(T *a) { *a = 1; } - - int main(int argc, char* argv[]) { // test case for using kernel function pointer - uint32_t gridSize = 0; - uint32_t blockSize = 0; + int gridSize = 0; + int blockSize = 0; hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f1, 0, 0); assert(gridSize != 0 && blockSize != 0); @@ -55,15 +49,5 @@ int main(int argc, char* argv[]) { hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, f2, 0, 0); assert(gridSize != 0 && blockSize != 0); - // test case for using kernel with hipFunction_t type - gridSize = 0; - blockSize = 0; - hipModule_t Module; - hipFunction_t Function; - HIPCHECK(hipModuleLoad(&Module, fileName)); - HIPCHECK(hipModuleGetFunction(&Function, Module, kernel_name)); - HIPCHECK(hipOccupancyMaxPotentialBlockSize(&gridSize, &blockSize, Function, 0, 0)); - assert(gridSize != 0 && blockSize != 0); - passed(); } From 5b917afa5f6eb80acafe4b755d4ad9e8d6a56f32 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Thu, 24 Oct 2019 13:15:20 +0100 Subject: [PATCH 41/45] Make CAS loops use the TTAS idiom. (#1573) * Make CAS loops use the TTAS idiom. * More efficient re-formulation of TTAS. * Fix typo. * The typo was not quite a typo [ROCm/clr commit: 26914ec76e052461694691e2c2ad25ec4330caf6] --- .../include/hip/hcc_detail/hip_atomic.h | 42 ++++++++++++++----- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_atomic.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_atomic.h index a5ac94a74b..2c13411319 100644 --- a/projects/clr/hipamd/include/hip/hcc_detail/hip_atomic.h +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_atomic.h @@ -59,12 +59,17 @@ float atomicAdd(float* address, float val) { unsigned int* uaddr{reinterpret_cast(address)}; unsigned int old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned int r; + unsigned int r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS(uaddr, r, __float_as_uint(val + __uint_as_float(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __uint_as_float(r); } @@ -74,13 +79,18 @@ double atomicAdd(double* address, double val) { unsigned long long* uaddr{reinterpret_cast(address)}; unsigned long long old{__atomic_load_n(uaddr, __ATOMIC_RELAXED)}; - unsigned long long r; + unsigned long long r; do { - r = old; + r = __atomic_load_n(uaddr, __ATOMIC_RELAXED); + + if (r != old) { r = old; continue; } + old = atomicCAS( uaddr, r, __double_as_longlong(val + __longlong_as_double(r))); - } while (r != old); + + if (r == old) break; + } while (true); return __longlong_as_double(r); } @@ -144,7 +154,13 @@ unsigned long long atomicMin( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (val < tmp) { tmp = atomicCAS(address, tmp, val); } + while (val < tmp) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -167,7 +183,13 @@ unsigned long long atomicMax( unsigned long long* address, unsigned long long val) { unsigned long long tmp{__atomic_load_n(address, __ATOMIC_RELAXED)}; - while (tmp < val) { tmp = atomicCAS(address, tmp, val); } + while (tmp < val) { + const auto tmp1 = __atomic_load_n(address, __ATOMIC_RELAXED); + + if (tmp1 != tmp) { tmp = tmp1; continue; } + + tmp = atomicCAS(address, tmp, val); + } return tmp; } @@ -177,7 +199,7 @@ inline unsigned int atomicInc(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_inc( unsigned int*, unsigned int, @@ -194,7 +216,7 @@ inline unsigned int atomicDec(unsigned int* address, unsigned int val) { __device__ - extern + extern unsigned int __builtin_amdgcn_atomic_dec( unsigned int*, unsigned int, From 21a2925ee7ff696eac7f5a1a4585016dbdeac106 Mon Sep 17 00:00:00 2001 From: gandryey <56892148+gandryey@users.noreply.github.com> Date: Thu, 24 Oct 2019 08:15:42 -0400 Subject: [PATCH 42/45] Hip vdi profiling header (#1577) Add HIP-VDI profiling interface for GPU timing collection. [ROCm/clr commit: f25692b3999246a8d202956cf71e5eed0024862b] --- .../include/hip/hcc_detail/hip_runtime_prof.h | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_prof.h diff --git a/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_prof.h b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_prof.h new file mode 100644 index 0000000000..4d4eccb54d --- /dev/null +++ b/projects/clr/hipamd/include/hip/hcc_detail/hip_runtime_prof.h @@ -0,0 +1,77 @@ +/* +Copyright (c) 2019 - present Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H +#define HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + +// HIP VDI Op IDs enumeration +enum HipVdiOpId { + kHipVdiOpIdDispatch = 0, + kHipVdiOpIdCopy = 1, + kHipVdiOpIdBarrier = 2, + kHipVdiOpIdNumber = 3 +}; + +// Types of VDI commands +enum HipVdiCommandKind { + kHipVdiCommandKernel = 0x11F0, + kHipVdiMemcpyDeviceToHost = 0x11F3, + kHipHipVdiMemcpyHostToDevice = 0x11F4, + kHipVdiMemcpyDeviceToDevice = 0x11F5, + kHipVidMemcpyDeviceToHostRect = 0x1201, + kHipVdiMemcpyHostToDeviceRect = 0x1202, + kHipVdiMemcpyDeviceToDeviceRect = 0x1203, + kHipVdiFillMemory = 0x1207, +}; + +/** + * @brief Initializes activity callback + * + * @param [input] id_callback Event ID callback function + * @param [input] op_callback Event operation callback function + * @param [input] arg Arguments passed into callback + * + * @returns None + */ +void hipInitActivityCallback(void* id_callback, void* op_callback, void* arg); + +/** + * @brief Enables activity callback + * + * @param [input] op Operation, which will trigger a callback (@see HipVdiOpId) + * @param [input] enable Enable state for the callback + * + * @returns True if successful + */ +bool hipEnableActivityCallback(uint32_t op, bool enable); + +/** + * @brief Returns the description string for the operation kind + * + * @param [input] id Command kind id (@see HipVdiCommandKind) + * + * @returns A pointer to a const string with the command description + */ +const char* hipGetCmdName(uint32_t id); + +#endif // HIP_INCLUDE_HIP_HCC_DETAIL_HIP_RUNTIME_PROF_H + From 7f429afe2ece09dba5c8048c55856222d9fd4d4f Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 24 Oct 2019 05:15:51 -0700 Subject: [PATCH 43/45] Add HIP checks in texture driver sample (#1581) [ROCm/clr commit: 170c4f02705854edc46090f876c6f16eaa3cfa62] --- .../11_texture_driver/texture2dDrv.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp b/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp index b3c1ef5d0c..3be6d12b31 100755 --- a/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp +++ b/projects/clr/hipamd/samples/2_Cookbook/11_texture_driver/texture2dDrv.cpp @@ -62,7 +62,7 @@ bool runTest(int argc, char** argv) { desc.NumChannels = 1; desc.Width = width; desc.Height = height; - hipArrayCreate(&array, &desc); + HIP_CHECK(hipArrayCreate(&array, &desc)); hip_Memcpy2D copyParam; memset(©Param, 0, sizeof(copyParam)); @@ -73,19 +73,19 @@ bool runTest(int argc, char** argv) { copyParam.srcPitch = width * sizeof(float); copyParam.WidthInBytes = copyParam.srcPitch; copyParam.Height = height; - hipMemcpyParam2D(©Param); + HIP_CHECK(hipMemcpyParam2D(©Param)); textureReference* texref; - hipModuleGetTexRef(&texref, Module, "tex"); - hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap); - hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap); - hipTexRefSetFilterMode(texref, hipFilterModePoint); - hipTexRefSetFlags(texref, 0); - hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1); - hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT); + HIP_CHECK(hipModuleGetTexRef(&texref, Module, "tex")); + HIP_CHECK(hipTexRefSetAddressMode(texref, 0, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetAddressMode(texref, 1, hipAddressModeWrap)); + HIP_CHECK(hipTexRefSetFilterMode(texref, hipFilterModePoint)); + HIP_CHECK(hipTexRefSetFlags(texref, 0)); + HIP_CHECK(hipTexRefSetFormat(texref, HIP_AD_FORMAT_FLOAT, 1)); + HIP_CHECK(hipTexRefSetArray(texref, array, HIP_TRSA_OVERRIDE_FORMAT)); float* dData = NULL; - hipMalloc((void**)&dData, size); + HIP_CHECK(hipMalloc((void**)&dData, size)); struct { void* _Ad; @@ -112,7 +112,7 @@ bool runTest(int argc, char** argv) { float* hOutputData = (float*)malloc(size); memset(hOutputData, 0, size); - hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost); + HIP_CHECK(hipMemcpy(hOutputData, dData, size, hipMemcpyDeviceToHost)); for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { @@ -124,8 +124,8 @@ bool runTest(int argc, char** argv) { } } } - hipFree(dData); - hipFreeArray(array); + HIP_CHECK(hipFree(dData)); + HIP_CHECK(hipFreeArray(array)); return testResult; } From e7f4cf4487636cd0f5792ddbe7dd6b9e6ec49746 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary <51944368+cjatin@users.noreply.github.com> Date: Thu, 24 Oct 2019 17:46:06 +0530 Subject: [PATCH 44/45] Adding New Analyze Target Merging with cppcheck (#1583) [ROCm/clr commit: f53b1a1755dd540b57020f6f448e6ddf383cd0f0] --- projects/clr/hipamd/CMakeLists.txt | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/projects/clr/hipamd/CMakeLists.txt b/projects/clr/hipamd/CMakeLists.txt index 28b8683b22..57369a9039 100644 --- a/projects/clr/hipamd/CMakeLists.txt +++ b/projects/clr/hipamd/CMakeLists.txt @@ -504,7 +504,7 @@ endif() find_program(CPPCHECK_EXE cppcheck) if(CPPCHECK_EXE) add_custom_target(cppcheck COMMAND ${CPPCHECK_EXE} --force --quiet --enable=warning,performance,portability,information,missingInclude src include -I /opt/rocm/include/hcc -I /opt/rocm/include --suppress=*:/opt/rocm/include/hcc/hc.hpp - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() ############################# @@ -545,4 +545,17 @@ else() message(STATUS "Testing targets will not be available. To enable them please ensure that the HIP installation directory is writeable. Use -DCMAKE_INSTALL_PREFIX to specify a suitable location") endif() +############################# +# Code analysis +############################# +# Target: clang +if(HIP_HIPCC_EXECUTABLE) + add_custom_target(analyze + COMMAND ${HIP_HIPCC_EXECUTABLE} -fvisibility=hidden -fvisibility-inlines-hidden --analyze --analyzer-outputtext -isystem /opt/rocm/include ${HIP_HCC_BUILD_FLAGS} -Wno-unused-command-line-argument -I/opt/rocm/include -c src/*.cpp -Iinclude/ -I./ + WORKING_DIRECTORY ${HIP_SRC_PATH}) + if(CPPCHECK_EXE) + add_dependencies(analyze cppcheck) + endif() +endif() + # vim: ts=4:sw=4:expandtab:smartindent From 6760e4065ee5d15d8652896fcaa6d13c2029a9fd Mon Sep 17 00:00:00 2001 From: Rahul Garg Date: Thu, 24 Oct 2019 05:21:55 -0700 Subject: [PATCH 45/45] Update profiling doc (#1576) [ROCm/clr commit: ff8d3fa446ce9b8d0b19a423c1c414d7f86a6e68] --- projects/clr/hipamd/docs/markdown/hip_profiling.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/clr/hipamd/docs/markdown/hip_profiling.md b/projects/clr/hipamd/docs/markdown/hip_profiling.md index 8a44368680..28ed37e321 100644 --- a/projects/clr/hipamd/docs/markdown/hip_profiling.md +++ b/projects/clr/hipamd/docs/markdown/hip_profiling.md @@ -196,8 +196,7 @@ This file can be copied and edited to provide more selective HSA event recording #### How to enable profiling at HIP build time -Recent pre-built packages of HIP are always built with profiling support enabled. -For developer builds, you must enable marker support manually when compiling HIP. +Pre-built packages of HIP are not built with profiling support enabled.You must enable marker support manually when compiling HIP. 1. Build HIP with ATP markers enabled HIP pre-built packages are enabled with ATP marker support by default.