diff --git a/hipamd/bin/hipcc b/hipamd/bin/hipcc index 8f18ab0d8e..b991bd40aa 100755 --- a/hipamd/bin/hipcc +++ b/hipamd/bin/hipcc @@ -127,7 +127,9 @@ if ($HIP_PLATFORM eq "hcc") { # Force -stdlib=libc++ on UB14.04 $HOST_OSVER= `cat /etc/os-release | grep "^VERSION_ID\=" | cut -d= -f2 | tr -d '\n'`; - if ($HOST_OSNAME eq "ubuntu" and $HOST_OSVER eq "\"14.04\"") { + if (($HOST_OSNAME eq "ubuntu" and $HOST_OSVER eq "\"14.04\"") + or ($HOST_OSNAME eq "\"centos\"" and $HOST_OSVER eq "\"7\"") + or ($HOST_OSNAME eq "\"rhel\"" and $HOST_OSVER eq "\"7.4\"")) { $HIPCXXFLAGS .= " -stdlib=libc++"; $setStdLib = 1; } @@ -136,7 +138,6 @@ if ($HIP_PLATFORM eq "hcc") { $HIPCXXFLAGS .= " -I$HSA_PATH/include"; $HIPCXXFLAGS .= " -Wno-deprecated-register"; - $HIPLDFLAGS .= " -lsupc++"; $HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt "; # $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport "; @@ -438,6 +439,7 @@ if($HIP_PLATFORM eq "hcc"){ if ($target_gfx900 eq 1) { $HIPLDFLAGS .= " --amdgpu-target=gfx900"; $HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 "; + $ENV{HCC_EXTRA_LIBRARIES_GFX900}="$HIP_PATH/lib/hip_hc_gfx803.ll\n"; } } diff --git a/hipamd/hipify-clang/src/ArgParse.cpp b/hipamd/hipify-clang/src/ArgParse.cpp new file mode 100644 index 0000000000..b27643710f --- /dev/null +++ b/hipamd/hipify-clang/src/ArgParse.cpp @@ -0,0 +1,40 @@ +#include "ArgParse.h" + +cl::OptionCategory ToolTemplateCategory("CUDA to HIP source translator options"); + +cl::opt OutputFilename("o", + cl::desc("Output filename"), + cl::value_desc("filename"), + cl::cat(ToolTemplateCategory)); + +cl::opt Inplace("inplace", + cl::desc("Modify input file inplace, replacing input with hipified output, save backup in .prehip file"), + cl::value_desc("inplace"), + cl::cat(ToolTemplateCategory)); + +cl::opt NoBackup("no-backup", + cl::desc("Don't create a backup file for the hipified source"), + cl::value_desc("no-backup"), + cl::cat(ToolTemplateCategory)); + +cl::opt NoOutput("no-output", + cl::desc("Don't write any translated output to stdout"), + cl::value_desc("no-output"), + cl::cat(ToolTemplateCategory)); + +cl::opt PrintStats("print-stats", + cl::desc("Print translation statistics"), + cl::value_desc("print-stats"), + cl::cat(ToolTemplateCategory)); + +cl::opt OutputStatsFilename("o-stats", + cl::desc("Output filename for statistics"), + cl::value_desc("filename"), + cl::cat(ToolTemplateCategory)); + +cl::opt Examine("examine", + cl::desc("Combines -no-output and -print-stats options"), + cl::value_desc("examine"), + cl::cat(ToolTemplateCategory)); + +cl::extrahelp CommonHelp(ct::CommonOptionsParser::HelpMessage); diff --git a/hipamd/hipify-clang/src/ArgParse.h b/hipamd/hipify-clang/src/ArgParse.h new file mode 100644 index 0000000000..b937a8dd15 --- /dev/null +++ b/hipamd/hipify-clang/src/ArgParse.h @@ -0,0 +1,19 @@ +#pragma once + +#include "clang/Tooling/CommonOptionsParser.h" +#include "llvm/Support/CommandLine.h" + +namespace cl = llvm::cl; +namespace ct = clang::tooling; + +extern cl::OptionCategory ToolTemplateCategory; + +extern cl::opt OutputFilename; +extern cl::opt Inplace; +extern cl::opt NoBackup; +extern cl::opt NoOutput; +extern cl::opt PrintStats; +extern cl::opt OutputStatsFilename; +extern cl::opt Examine; + +extern cl::extrahelp CommonHelp; diff --git a/hipamd/hipify-clang/src/CUDA2HipMap.cpp b/hipamd/hipify-clang/src/CUDA2HipMap.cpp index de6ddb2d74..8e76b5fdde 100644 --- a/hipamd/hipify-clang/src/CUDA2HipMap.cpp +++ b/hipamd/hipify-clang/src/CUDA2HipMap.cpp @@ -312,9 +312,16 @@ const std::map CUDA_TYPE_NAME_MAP{ /// Maps cuda header names to hip header names. const std::map CUDA_INCLUDE_MAP{ // CUDA includes - {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, - {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, - {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}}, + {"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}}, + {"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}}, + {"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}}, + {"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}}, + {"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}}, + {"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}}, + {"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}}, // CUBLAS includes {"cublas.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}}, @@ -2677,7 +2684,6 @@ const std::map& CUDA_RENAMES_MAP() { // First run, so compute the union map. ret = CUDA_IDENTIFIER_MAP; - ret.insert(CUDA_INCLUDE_MAP.begin(), CUDA_INCLUDE_MAP.end()); ret.insert(CUDA_TYPE_NAME_MAP.begin(), CUDA_TYPE_NAME_MAP.end()); return ret; diff --git a/hipamd/hipify-clang/src/Cuda2Hip.cpp b/hipamd/hipify-clang/src/Cuda2Hip.cpp deleted file mode 100644 index a1cf80fde9..0000000000 --- a/hipamd/hipify-clang/src/Cuda2Hip.cpp +++ /dev/null @@ -1,861 +0,0 @@ -/* -Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ -/** - * @file Cuda2Hip.cpp - * - * This file is compiled and linked into clang based hipify tool. - */ -#include "clang/ASTMatchers/ASTMatchFinder.h" -#include "clang/ASTMatchers/ASTMatchers.h" -#include "clang/Basic/SourceManager.h" -#include "clang/Frontend/CompilerInstance.h" -#include "clang/Frontend/FrontendActions.h" -#include "clang/Frontend/TextDiagnosticPrinter.h" -#include "clang/Lex/Lexer.h" -#include "clang/Lex/MacroArgs.h" -#include "clang/Lex/MacroInfo.h" -#include "clang/Lex/PPCallbacks.h" -#include "clang/Lex/Preprocessor.h" -#include "clang/Rewrite/Core/Rewriter.h" -#include "clang/Tooling/CommonOptionsParser.h" -#include "clang/Tooling/Refactoring.h" -#include "clang/Tooling/Tooling.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MemoryBuffer.h" -#include "llvm/Support/Signals.h" - -#include -#include -#include -#include -#include -#include -#include - -#include "CUDA2HipMap.h" -#include "LLVMCompat.h" -#include "StringUtils.h" - -using namespace clang; -using namespace clang::ast_matchers; -using namespace clang::tooling; -using namespace llvm; - -#define DEBUG_TYPE "cuda2hip" - - -// Set up the command line options -static cl::OptionCategory ToolTemplateCategory("CUDA to HIP source translator options"); - -static cl::opt OutputFilename("o", - cl::desc("Output filename"), - cl::value_desc("filename"), - cl::cat(ToolTemplateCategory)); - -static cl::opt Inplace("inplace", - cl::desc("Modify input file inplace, replacing input with hipified output, save backup in .prehip file"), - cl::value_desc("inplace"), - cl::cat(ToolTemplateCategory)); - -static cl::opt NoBackup("no-backup", - cl::desc("Don't create a backup file for the hipified source"), - cl::value_desc("no-backup"), - cl::cat(ToolTemplateCategory)); - -static cl::opt NoOutput("no-output", - cl::desc("Don't write any translated output to stdout"), - cl::value_desc("no-output"), - cl::cat(ToolTemplateCategory)); - -static cl::opt PrintStats("print-stats", - cl::desc("Print translation statistics"), - cl::value_desc("print-stats"), - cl::cat(ToolTemplateCategory)); - -static cl::opt OutputStatsFilename("o-stats", - cl::desc("Output filename for statistics"), - cl::value_desc("filename"), - cl::cat(ToolTemplateCategory)); - -static cl::opt Examine("examine", - cl::desc("Combines -no-output and -print-stats options"), - cl::value_desc("examine"), - cl::cat(ToolTemplateCategory)); - -static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage); - -class Cuda2Hip { -public: - Cuda2Hip(Replacements& R, const std::string &srcFileName) : - Replace(R), mainFileName(srcFileName) {} - - enum msgTypes { - HIPIFY_ERROR = 0, - HIPIFY_WARNING - }; - - std::string getMsgType(msgTypes type) { - switch (type) { - case HIPIFY_ERROR: return "error"; - default: - case HIPIFY_WARNING: return "warning"; - } - } - -protected: - Replacements& Replace; - std::string mainFileName; - - virtual void insertReplacement(const Replacement &rep, const FullSourceLoc &fullSL) { - llcompat::insertReplacement(Replace, rep); - if (PrintStats) { - rep.getLength(); - Statistics::current().lineTouched(fullSL.getExpansionLineNumber()); - Statistics::current().bytesChanged(rep.getLength()); - } - } - void insertHipHeaders(Cuda2Hip *owner, const SourceManager &SM) { - if (Replace.size() > 0) { - std::string repName = "#include "; - Statistics::current().incrementCounter({repName, ConvTypes::CONV_INCLUDE_CUDA_MAIN_H, ApiTypes::API_RUNTIME}, "#include "); - SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID()); - FullSourceLoc fullSL(sl, SM); - Replacement Rep(SM, sl, 0, repName + "\n"); - insertReplacement(Rep, fullSL); - } - } - - void printHipifyMessage(const SourceManager &SM, const SourceLocation &sl, const std::string &message, msgTypes msgType = HIPIFY_WARNING) { - FullSourceLoc fullSL(sl, SM); - llvm::errs() << "[HIPIFY] " << getMsgType(msgType) << ": " << mainFileName << ":" << fullSL.getExpansionLineNumber() << ":" << fullSL.getExpansionColumnNumber() << ": " << message << "\n"; - } - - void processString(StringRef s, SourceManager &SM, SourceLocation start) { - size_t begin = 0; - while ((begin = s.find("cu", begin)) != StringRef::npos) { - const size_t end = s.find_first_of(" ", begin + 4); - StringRef name = s.slice(begin, end); - const auto found = CUDA_RENAMES_MAP().find(name); - if (found != CUDA_RENAMES_MAP().end()) { - StringRef repName = found->second.hipName; - hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported}; - Statistics::current().incrementCounter(counter, name.str()); - if (!counter.unsupported) { - SourceLocation sl = start.getLocWithOffset(begin + 1); - Replacement Rep(SM, sl, name.size(), repName); - FullSourceLoc fullSL(sl, SM); - insertReplacement(Rep, fullSL); - } - } - - if (end == StringRef::npos) { - break; - } - begin = end + 1; - } - } -}; - -class Cuda2HipCallback; - -class HipifyPPCallbacks : public PPCallbacks, public SourceFileCallbacks, public Cuda2Hip { -public: - HipifyPPCallbacks(Replacements& R, const std::string &mainFileName) - : Cuda2Hip(R, mainFileName) {} - - virtual bool handleBeginSource(CompilerInstance &CI -#if LLVM_VERSION_MAJOR <= 4 - , StringRef Filename -#endif - ) override { - Preprocessor &PP = CI.getPreprocessor(); - SourceManager &SM = CI.getSourceManager(); - setSourceManager(&SM); - PP.addPPCallbacks(std::unique_ptr(this)); - setPreprocessor(&PP); - return true; - } - - virtual void handleEndSource() override; - - virtual void InclusionDirective(SourceLocation hash_loc, - const Token &include_token, - StringRef file_name, bool is_angled, - CharSourceRange filename_range, - const FileEntry *file, StringRef search_path, - StringRef relative_path, - const clang::Module *imported) override { - if (!_sm->isWrittenInMainFile(hash_loc) || !is_angled) { - return; // We're looking to rewrite angle-includes in the main file to point to hip. - } - - const auto found = CUDA_INCLUDE_MAP.find(file_name); - if (found == CUDA_INCLUDE_MAP.end()) { - // Not a CUDA include - don't touch it. - return; - } - - Statistics::current().incrementCounter(found->second, file_name.str()); - if (found->second.unsupported) { - // An unsupported CUDA header? Oh dear. Print a warning. - printHipifyMessage(*_sm, hash_loc, "Unsupported CUDA header used: " + file_name.str()); - return; - } - - StringRef repName = found->second.hipName; - DEBUG(dbgs() << "Include file found: " << file_name << "\n" - << "SourceLocation: " - << filename_range.getBegin().printToString(*_sm) << "\n" - << "Will be replaced with " << repName << "\n"); - SourceLocation sl = filename_range.getBegin(); - SourceLocation sle = filename_range.getEnd(); - const char *B = _sm->getCharacterData(sl); - const char *E = _sm->getCharacterData(sle); - SmallString<128> tmpData; - Replacement Rep(*_sm, sl, E - B, Twine("<" + repName + ">").toStringRef(tmpData)); - FullSourceLoc fullSL(sl, *_sm); - insertReplacement(Rep, fullSL); - } - - /** - * Look at, and consider altering, a given token. - * - * If it's not a CUDA identifier, nothing happens. - * If it's an unsupported CUDA identifier, a warning is emitted. - * Otherwise, the source file is updated with the corresponding hipification. - */ - void RewriteToken(Token t) { - // String literals containing CUDA references need fixing... - if (t.is(tok::string_literal)) { - StringRef s(t.getLiteralData(), t.getLength()); - processString(unquoteStr(s), *_sm, t.getLocation()); - return; - } else if (!t.isAnyIdentifier()) { - // If it's neither a string nor an identifier, we don't care. - return; - } - - StringRef name = t.getIdentifierInfo()->getName(); - const auto found = CUDA_RENAMES_MAP().find(name); - if (found == CUDA_RENAMES_MAP().end()) { - // So it's an identifier, but not CUDA? Boring. - return; - } - Statistics::current().incrementCounter(found->second, name.str()); - - SourceLocation sl = t.getLocation(); - if (found->second.unsupported) { - // An unsupported identifier? Curses! Warn the user. - printHipifyMessage(*_sm, sl, "Unsupported CUDA identifier used: " + name.str()); - return; - } - - StringRef repName = found->second.hipName; - Replacement Rep(*_sm, sl, name.size(), repName); - FullSourceLoc fullSL(sl, *_sm); - insertReplacement(Rep, fullSL); - } - - virtual void MacroDefined(const Token &MacroNameTok, - const MacroDirective *MD) override { - if (!_sm->isWrittenInMainFile(MD->getLocation()) || - MD->getKind() != MacroDirective::MD_Define) { - return; - } - - for (auto T : MD->getMacroInfo()->tokens()) { - RewriteToken(T); - } - } - - virtual void MacroExpands(const Token &MacroNameTok, - const MacroDefinition &MD, SourceRange Range, - const MacroArgs *Args) override { - - if (!_sm->isWrittenInMainFile(MacroNameTok.getLocation())) { - return; // Macros in headers are not our concern. - } - - // Is the macro itself a CUDA identifier? If so, rewrite it - RewriteToken(MacroNameTok); - - // If it's a macro with arguments, rewrite all the arguments as hip, too. - for (unsigned int i = 0; Args && i < MD.getMacroInfo()->GET_NUM_ARGS(); i++) { - std::vector toks; - // Code below is a kind of stolen from 'MacroArgs::getPreExpArgument' - // to workaround the 'const' MacroArgs passed into this hook. - const Token *start = Args->getUnexpArgument(i); - size_t len = Args->getArgLength(start) + 1; - llcompat::EnterPreprocessorTokenStream(*_pp, start, len, false); - - do { - toks.push_back(Token()); - Token &tk = toks.back(); - _pp->Lex(tk); - } while (toks.back().isNot(tok::eof)); - - _pp->RemoveTopOfLexerStack(); - // end of stolen code - - for (auto tok : toks) { - RewriteToken(tok); - } - } - } - - void EndOfMainFile() override {} - - void setSourceManager(SourceManager *sm) { _sm = sm; } - void setPreprocessor(Preprocessor *pp) { _pp = pp; } - void setMatch(Cuda2HipCallback *match) { Match = match; } - -private: - SourceManager *_sm = nullptr; - Preprocessor *_pp = nullptr; - Cuda2HipCallback *Match = nullptr; -}; - -class Cuda2HipCallback : public MatchFinder::MatchCallback, public Cuda2Hip { -private: - bool cudaCall(const MatchFinder::MatchResult &Result) { - const CallExpr *call = Result.Nodes.getNodeAs("cudaCall"); - if (!call) { - return false; // Another handler will do it. - } - - const FunctionDecl *funcDcl = call->getDirectCallee(); - std::string name = funcDcl->getDeclName().getAsString(); - SourceManager *SM = Result.SourceManager; - SourceLocation sl = call->getLocStart(); - - // TODO: Make a lookup table just for functions to improve performance. - const auto found = CUDA_IDENTIFIER_MAP.find(name); - if (found == CUDA_IDENTIFIER_MAP.end()) { - std::string msg = "the following reference is not handled: '" + name + "' [function call]."; - printHipifyMessage(*SM, sl, msg); - return true; - } - - const hipCounter& hipCtr = found->second; - Statistics::current().incrementCounter(hipCtr, name); - - if (hipCtr.unsupported) { - return true; // Silently fail when you find an unsupported member. - // TODO: Print a warning with the diagnostics API? - } - - size_t length = name.size(); - Replacement Rep(*SM, sl, length, hipCtr.hipName); - FullSourceLoc fullSL(sl, *SM); - insertReplacement(Rep, fullSL); - - return true; - } - - SourceRange getReadRange(clang::SourceManager &SM, const SourceRange &exprRange) { - SourceLocation begin = exprRange.getBegin(); - SourceLocation end = exprRange.getEnd(); - - bool beginSafe = !SM.isMacroBodyExpansion(begin) || Lexer::isAtStartOfMacroExpansion(begin, SM, LangOptions{}); - bool endSafe = !SM.isMacroBodyExpansion(end) || Lexer::isAtEndOfMacroExpansion(end, SM, LangOptions{}); - - if (beginSafe && endSafe) { - return {SM.getFileLoc(begin), SM.getFileLoc(end)}; - } else { - return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; - } - } - - SourceRange getWriteRange(clang::SourceManager &SM, const SourceRange &exprRange) { - SourceLocation begin = exprRange.getBegin(); - SourceLocation end = exprRange.getEnd(); - - // If the range is contained within a macro, update the macro definition. - // Otherwise, use the file location and hope for the best. - if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) { - return {SM.getFileLoc(begin), SM.getFileLoc(end)}; - } - - return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; - } - - StringRef readSourceText(clang::SourceManager& SM, const SourceRange& exprRange) { - return Lexer::getSourceText(CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, LangOptions(), nullptr); - } - - /** - * Get a string representation of the expression `arg`, unless it's a defaulting function - * call argument, in which case get a 0. Used for building argument lists to kernel calls. - */ - std::string stringifyZeroDefaultedArg(SourceManager& SM, const Expr* arg) { - if (isa(arg)) { - return "0"; - } else { - return readSourceText(SM, arg->getSourceRange()); - } - } - - bool cudaLaunchKernel(const MatchFinder::MatchResult &Result) { - StringRef refName = "cudaLaunchKernel"; - if (const CUDAKernelCallExpr *launchKernel = Result.Nodes.getNodeAs(refName)) { - SmallString<40> XStr; - raw_svector_ostream OS(XStr); - - LangOptions DefaultLangOptions; - SourceManager *SM = Result.SourceManager; - - const Expr& calleeExpr = *(launchKernel->getCallee()); - OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", "; - - // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero. - const CallExpr& config = *(launchKernel->getConfig()); - - // Copy the two dimensional arguments verbatim. - OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), "; - OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), "; - - // The stream/memory arguments default to zero if omitted. - OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", "; - OS << stringifyZeroDefaultedArg(*SM, config.getArg(3)); - - // If there are ordinary arguments to the kernel, just copy them verbatim into our new call. - int numArgs = launchKernel->getNumArgs(); - if (numArgs > 0) { - OS << ", "; - - // Start of the first argument. - SourceLocation argStart = launchKernel->getArg(0)->getLocStart(); - - // End of the last argument. - SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd(); - - OS << readSourceText(*SM, {argStart, argEnd}); - } - - OS << ")"; - - SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()}); - SourceLocation launchStart = replacementRange.getBegin(); - SourceLocation launchEnd = replacementRange.getEnd(); - - size_t length = SM->getCharacterData(Lexer::getLocForEndOfToken( - launchEnd, 0, *SM, DefaultLangOptions)) - - SM->getCharacterData(launchStart); - - Replacement Rep(*SM, launchStart, length, OS.str()); - FullSourceLoc fullSL(launchStart, *SM); - insertReplacement(Rep, fullSL); - hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME}; - Statistics::current().incrementCounter(counter, refName.str()); - return true; - } - return false; - } - - bool cudaBuiltin(const MatchFinder::MatchResult &Result) { - if (const MemberExpr *threadIdx = Result.Nodes.getNodeAs("cudaBuiltin")) { - if (const OpaqueValueExpr *refBase = - dyn_cast(threadIdx->getBase())) { - if (const DeclRefExpr *declRef = - dyn_cast(refBase->getSourceExpr())) { - SourceLocation sl = threadIdx->getLocStart(); - SourceManager *SM = Result.SourceManager; - StringRef name = declRef->getDecl()->getName(); - StringRef memberName = threadIdx->getMemberDecl()->getName(); - size_t pos = memberName.find_first_not_of("__fetch_builtin_"); - memberName = memberName.slice(pos, memberName.size()); - SmallString<128> tmpData; - name = Twine(name + "." + memberName).toStringRef(tmpData); - - // TODO: Make a lookup table just for builtins to improve performance. - const auto found = CUDA_IDENTIFIER_MAP.find(name); - if (found != CUDA_IDENTIFIER_MAP.end()) { - Statistics::current().incrementCounter(found->second, name.str()); - if (!found->second.unsupported) { - StringRef repName = found->second.hipName; - Replacement Rep(*SM, sl, name.size(), repName); - FullSourceLoc fullSL(sl, *SM); - insertReplacement(Rep, fullSL); - } - } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [builtin]."; - printHipifyMessage(*SM, sl, msg); - } - } - } - return true; - } - return false; - } - - bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) { - if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs("cudaEnumConstantRef")) { - StringRef name = enumConstantRef->getDecl()->getName(); - SourceLocation sl = enumConstantRef->getLocStart(); - SourceManager *SM = Result.SourceManager; - - // TODO: Make a lookup table just for enum values to improve performance. - const auto found = CUDA_IDENTIFIER_MAP.find(name); - if (found != CUDA_IDENTIFIER_MAP.end()) { - Statistics::current().incrementCounter(found->second, name.str()); - if (!found->second.unsupported) { - StringRef repName = found->second.hipName; - Replacement Rep(*SM, sl, name.size(), repName); - FullSourceLoc fullSL(sl, *SM); - insertReplacement(Rep, fullSL); - } - } else { - std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant ref]."; - printHipifyMessage(*SM, sl, msg); - } - return true; - } - return false; - } - - bool cudaType(const MatchFinder::MatchResult& Result) { - const clang::TypeLoc* ret = Result.Nodes.getNodeAs("cudaType"); - if (!ret) { - return false; - } - - // Ignore qualifiers - they don't alter our decision to rename. - clang::UnqualTypeLoc tl = ret->getUnqualifiedLoc(); - const Type& typeObject = *(tl.getTypePtr()); - - std::string typeName = tl.getType().getAsString(); - - // Irritatingly, enum/struct types are identified as `enum/struct `, and unlike most compound - // types (such as pointers or references), there isn't another type node inside. So we have - // to make do with what we've got. There's probably a better way of doing this... - if (typeObject.isEnumeralType()) { - removePrefixIfPresent(typeName, "enum "); - } - if (typeObject.isStructureType()) { - removePrefixIfPresent(typeName, "struct "); - } - - // Do we have a replacement for this type? - const auto found = CUDA_TYPE_NAME_MAP.find(typeName); - if (found == CUDA_TYPE_NAME_MAP.end()) { - return false; - } - - SourceManager &SM = *(Result.SourceManager); - - // Start of the type expression to replace. - SourceLocation sl = tl.getBeginLoc(); - - const hipCounter& hipCtr = found->second; - if (hipCtr.unsupported) { - printHipifyMessage(SM, sl, "Unsupported CUDA '" + typeName); - return false; - } - - // Apply the rename! - Replacement Rep(SM, sl, typeName.size(), hipCtr.hipName); - FullSourceLoc fullSL(sl, SM); - insertReplacement(Rep, fullSL); - - return true; - } - - bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) { - StringRef refName = "cudaSharedIncompleteArrayVar"; - if (const VarDecl *sharedVar = Result.Nodes.getNodeAs(refName)) { - // Example: extern __shared__ uint sRadix1[]; - if (sharedVar->hasExternalFormalLinkage()) { - QualType QT = sharedVar->getType(); - std::string typeName; - if (QT->isIncompleteArrayType()) { - const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); - QT = AT->getElementType(); - if (QT.getTypePtr()->isBuiltinType()) { - QT = QT.getCanonicalType(); - const BuiltinType *BT = dyn_cast(QT); - if (BT) { - LangOptions LO; - LO.CUDA = true; - PrintingPolicy policy(LO); - typeName = BT->getName(policy); - } - } else { - typeName = QT.getAsString(); - } - } - if (!typeName.empty()) { - SourceLocation slStart = sharedVar->getLocStart(); - SourceLocation slEnd = sharedVar->getLocEnd(); - SourceManager *SM = Result.SourceManager; - size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; - std::string varName = sharedVar->getNameAsString(); - std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")"; - Replacement Rep(*SM, slStart, repLength, repName); - FullSourceLoc fullSL(slStart, *SM); - insertReplacement(Rep, fullSL); - hipCounter counter = { "HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME }; - Statistics::current().incrementCounter(counter, refName.str()); - } - } - return true; - } - return false; - } - - bool stringLiteral(const MatchFinder::MatchResult &Result) { - if (const clang::StringLiteral *sLiteral = Result.Nodes.getNodeAs("stringLiteral")) { - if (sLiteral->getCharByteWidth() == 1) { - StringRef s = sLiteral->getString(); - SourceManager *SM = Result.SourceManager; - processString(s, *SM, sLiteral->getLocStart()); - } - return true; - } - return false; - } - -public: - Cuda2HipCallback(Replacements& Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks, const std::string &mainFileName) - : Cuda2Hip(Replace, mainFileName), owner(parent), PP(PPCallbacks) { - PP->setMatch(this); - } - - void run(const MatchFinder::MatchResult &Result) override { - if (cudaType(Result)) return; - if (cudaCall(Result)) return; - if (cudaBuiltin(Result)) return; - if (cudaEnumConstantRef(Result)) return; - if (cudaLaunchKernel(Result)) return; - if (cudaSharedIncompleteArrayVar(Result)) return; - if (stringLiteral(Result)) return; - } - -private: - ast_matchers::MatchFinder *owner; - HipifyPPCallbacks *PP; -}; - -void HipifyPPCallbacks::handleEndSource() { - insertHipHeaders(Match, *_sm); -} - -void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callback) { - // Rewrite CUDA api calls to hip ones. - Finder.addMatcher( - callExpr( - isExpansionInMainFile(), - callee( - functionDecl( - matchesName("cu.*"), - unless( - // Clang generates structs with functions on them to represent things like - // threadIdx.x. We have other logic to handle those builtins directly, so - // we need to suppress the call-handling. - // We can't handle those directly in the call-handler without special-casing - // it unpleasantly, since the names of the functions are unique only per-struct. - matchesName("__fetch_builtin.*") - ) - ) - ) - ).bind("cudaCall"), - Callback - ); - - // Rewrite all references to CUDA types to their corresponding hip types. - Finder.addMatcher( - typeLoc( - isExpansionInMainFile() - ).bind("cudaType"), - Callback - ); - - // Replace references to CUDA names in string literals with the equivalent hip names. - Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"), Callback); - - // Replace the <<<...>>> language extension with a hip kernel launch - Finder.addMatcher(cudaKernelCallExpr(isExpansionInMainFile()).bind("cudaLaunchKernel"), Callback); - - // Replace cuda builtins. - Finder.addMatcher( - memberExpr( - isExpansionInMainFile(), - hasObjectExpression( - hasType( - cxxRecordDecl( - matchesName("__cuda_builtin_") - ) - ) - ) - ).bind("cudaBuiltin"), - Callback - ); - - // Map CUDA enum _values_ to their hip equivalents. - Finder.addMatcher( - declRefExpr( - isExpansionInMainFile(), - to( - enumConstantDecl( - matchesName("cu.*|CU.*") - ) - ) - ).bind("cudaEnumConstantRef"), - Callback - ); - - Finder.addMatcher( - varDecl( - isExpansionInMainFile(), - allOf( - hasAttr(attr::CUDAShared), - hasType(incompleteArrayType()) - ) - ).bind("cudaSharedIncompleteArrayVar"), - Callback - ); -} - -void copyFile(const std::string& src, const std::string& dst) { - std::ifstream source(src, std::ios::binary); - std::ofstream dest(dst, std::ios::binary); - dest << source.rdbuf(); -} - -int main(int argc, const char **argv) { - llcompat::PrintStackTraceOnErrorSignal(); - - CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore); - std::vector fileSources = OptionsParser.getSourcePathList(); - std::string dst = OutputFilename; - if (!dst.empty() && fileSources.size() > 1) { - llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n"; - return 1; - } - - if (NoOutput) { - if (Inplace) { - llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n"; - return 1; - } - if (!dst.empty()) { - llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n"; - return 1; - } - } - - if (Examine) { - NoOutput = PrintStats = true; - } - - int Result = 0; - - // Arguments for the Statistics print routines. - std::unique_ptr csv = nullptr; - llvm::raw_ostream* statPrint = nullptr; - if (!OutputStatsFilename.empty()) { - csv = std::unique_ptr(new std::ofstream(OutputStatsFilename, std::ios_base::trunc)); - } - if (PrintStats) { - statPrint = &llvm::errs(); - } - - for (const auto & src : fileSources) { - if (dst.empty()) { - if (Inplace) { - dst = src; - } else { - dst = src + ".hip"; - } - } else if (Inplace) { - llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n"; - return 1; - } - - std::string tmpFile = src + ".hipify-tmp"; - - // Create a copy of the file to work on. When we're done, we'll move this onto the - // output (which may mean overwriting the input, if we're in-place). - // Should we fail for some reason, we'll just leak this file and not corrupt the input. - copyFile(src, tmpFile); - - // Initialise the statistics counters for this file. - Statistics::setActive(src); - - // RefactoringTool operates on the file in-place. Giving it the output path is no good, - // because that'll break relative includes, and we don't want to overwrite the input file. - // So what we do is operate on a copy, which we then move to the output. - RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile); - ast_matchers::MatchFinder Finder; - - // The Replacements to apply to the file `src`. - Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile); - HipifyPPCallbacks* PPCallbacks = new HipifyPPCallbacks(replacementsToUse, tmpFile); - Cuda2HipCallback Callback(replacementsToUse, &Finder, PPCallbacks, tmpFile); - - addAllMatchers(Finder, &Callback); - - auto action = newFrontendActionFactory(&Finder, PPCallbacks); - - Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("--cuda-host-only", ArgumentInsertPosition::BEGIN)); - - // Ensure at least c++11 is used. - Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11", ArgumentInsertPosition::BEGIN)); -#if defined(HIPIFY_CLANG_RES) - Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES)); -#endif - Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster()); - Result += Tool.run(action.get()); - Tool.clearArgumentsAdjusters(); - - LangOptions DefaultLangOptions; - IntrusiveRefCntPtr DiagOpts = new DiagnosticOptions(); - TextDiagnosticPrinter DiagnosticPrinter(llvm::errs(), &*DiagOpts); - DiagnosticsEngine Diagnostics(IntrusiveRefCntPtr(new DiagnosticIDs()), &*DiagOpts, &DiagnosticPrinter, false); - - SourceManager SM(Diagnostics, Tool.getFiles()); - - Rewriter Rewrite(SM, DefaultLangOptions); - if (!Tool.applyAllReplacements(Rewrite)) { - DEBUG(dbgs() << "Skipped some replacements.\n"); - } - - // Either move the tmpfile to the output, or remove it. - if (!NoOutput) { - Result += Rewrite.overwriteChangedFiles(); - rename(tmpFile.c_str(), dst.c_str()); - } else { - remove(tmpFile.c_str()); - } - - Statistics::current().markCompletion(); - Statistics::current().print(csv.get(), statPrint); - - dst.clear(); - } - - if (fileSources.size() > 1) { - Statistics::printAggregate(csv.get(), statPrint); - } - - return Result; -} diff --git a/hipamd/hipify-clang/src/HipifyAction.cpp b/hipamd/hipify-clang/src/HipifyAction.cpp new file mode 100644 index 0000000000..192dd00949 --- /dev/null +++ b/hipamd/hipify-clang/src/HipifyAction.cpp @@ -0,0 +1,460 @@ +#include "HipifyAction.h" + +#include + +#include "clang/Basic/SourceLocation.h" +#include "clang/Frontend/CompilerInstance.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/ASTMatchers/ASTMatchers.h" + +#include "LLVMCompat.h" +#include "CUDA2HipMap.h" +#include "StringUtils.h" +#include "ArgParse.h" + +namespace ct = clang::tooling; +namespace mat = clang::ast_matchers; + +void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) { + clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + + size_t begin = 0; + while ((begin = s.find("cu", begin)) != StringRef::npos) { + const size_t end = s.find_first_of(" ", begin + 4); + StringRef name = s.slice(begin, end); + const auto found = CUDA_RENAMES_MAP().find(name); + if (found != CUDA_RENAMES_MAP().end()) { + StringRef repName = found->second.hipName; + hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported}; + Statistics::current().incrementCounter(counter, name.str()); + + if (!counter.unsupported) { + clang::SourceLocation sl = start.getLocWithOffset(begin + 1); + ct::Replacement Rep(SM, sl, name.size(), repName); + clang::FullSourceLoc fullSL(sl, SM); + insertReplacement(Rep, fullSL); + } + } + + if (end == StringRef::npos) { + break; + } + + begin = end + 1; + } +} + +/** + * Look at, and consider altering, a given token. + * + * If it's not a CUDA identifier, nothing happens. + * If it's an unsupported CUDA identifier, a warning is emitted. + * Otherwise, the source file is updated with the corresponding hipification. + */ +void HipifyAction::RewriteToken(const clang::Token& t) { + clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + + // String literals containing CUDA references need fixing... + if (t.is(clang::tok::string_literal)) { + StringRef s(t.getLiteralData(), t.getLength()); + RewriteString(unquoteStr(s), t.getLocation()); + return; + } else if (!t.isAnyIdentifier()) { + // If it's neither a string nor an identifier, we don't care. + return; + } + + StringRef name = t.getRawIdentifier(); + const auto found = CUDA_RENAMES_MAP().find(name); + if (found == CUDA_RENAMES_MAP().end()) { + // So it's an identifier, but not CUDA? Boring. + return; + } + + Statistics::current().incrementCounter(found->second, name.str()); + + clang::SourceLocation sl = t.getLocation(); + if (found->second.unsupported) { + // An unsupported identifier? Curses! Warn the user. + clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier unsupported in hip"); + DE.Report(sl, ID); + return; + } + + StringRef repName = found->second.hipName; + ct::Replacement Rep(SM, sl, name.size(), repName); + clang::FullSourceLoc fullSL(sl, SM); + insertReplacement(Rep, fullSL); +} + +namespace { + +clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { + clang::SourceLocation begin = exprRange.getBegin(); + clang::SourceLocation end = exprRange.getEnd(); + + bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{}); + bool endSafe = !SM.isMacroBodyExpansion(end) || clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{}); + + if (beginSafe && endSafe) { + return {SM.getFileLoc(begin), SM.getFileLoc(end)}; + } else { + return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; + } +} + + +clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) { + clang::SourceLocation begin = exprRange.getBegin(); + clang::SourceLocation end = exprRange.getEnd(); + + // If the range is contained within a macro, update the macro definition. + // Otherwise, use the file location and hope for the best. + if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) { + return {SM.getFileLoc(begin), SM.getFileLoc(end)}; + } + + return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)}; +} + + +StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) { + return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr); +} + +/** + * Get a string representation of the expression `arg`, unless it's a defaulting function + * call argument, in which case get a 0. Used for building argument lists to kernel calls. + */ +std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) { + if (clang::isa(arg)) { + return "0"; + } else { + return readSourceText(SM, arg->getSourceRange()); + } +} + +} // anonymous namespace + + +void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc, + const clang::Token&, + StringRef file_name, + bool is_angled, + clang::CharSourceRange filename_range, + const clang::FileEntry*, StringRef, + StringRef, const clang::Module*) { + clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + if (!SM.isWrittenInMainFile(hash_loc)) { + return; + } + + const auto found = CUDA_INCLUDE_MAP.find(file_name); + if (found == CUDA_INCLUDE_MAP.end()) { + // Not a CUDA include - don't touch it. + return; + } + + // Special-casing to avoid duplication of the hip_runtime include. + if (found->second.hipName == "hip/hip_runtime.h") { + if (insertedRuntimeHeader) { + return; + } + + insertedRuntimeHeader = true; + } + + Statistics::current().incrementCounter(found->second, file_name.str()); + + clang::SourceLocation sl = filename_range.getBegin(); + if (found->second.unsupported) { + // An unsupported CUDA header? Oh dear. Print a warning. + clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header")); + return; + } + + const char *B = SM.getCharacterData(sl); + const char *E = SM.getCharacterData(filename_range.getEnd()); + clang::SmallString<128> includeBuffer; + clang::StringRef newInclude; + + // Keep the same include type that the user gave. + if (is_angled) { + newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer); + } else { + newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer); + } + + ct::Replacement Rep(SM, sl, E - B, newInclude); + insertReplacement(Rep, clang::FullSourceLoc{sl, SM}); +} + + +bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) { + StringRef refName = "cudaLaunchKernel"; + + const auto* launchKernel = Result.Nodes.getNodeAs(refName); + if (!launchKernel) { + return false; + } + + clang::SmallString<40> XStr; + llvm::raw_svector_ostream OS(XStr); + + clang::LangOptions DefaultLangOptions; + clang::SourceManager* SM = Result.SourceManager; + + const clang::Expr& calleeExpr = *(launchKernel->getCallee()); + OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", "; + + // Next up are the four kernel configuration parameters, the last two of which are optional and default to zero. + const clang::CallExpr& config = *(launchKernel->getConfig()); + + // Copy the two dimensional arguments verbatim. + OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), "; + OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), "; + + // The stream/memory arguments default to zero if omitted. + OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", "; + OS << stringifyZeroDefaultedArg(*SM, config.getArg(3)); + + // If there are ordinary arguments to the kernel, just copy them verbatim into our new call. + int numArgs = launchKernel->getNumArgs(); + if (numArgs > 0) { + OS << ", "; + + // Start of the first argument. + clang::SourceLocation argStart = launchKernel->getArg(0)->getLocStart(); + + // End of the last argument. + clang::SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd(); + + OS << readSourceText(*SM, {argStart, argEnd}); + } + + OS << ")"; + + clang::SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()}); + clang::SourceLocation launchStart = replacementRange.getBegin(); + clang::SourceLocation launchEnd = replacementRange.getEnd(); + + size_t length = SM->getCharacterData(clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) - SM->getCharacterData(launchStart); + + ct::Replacement Rep(*SM, launchStart, length, OS.str()); + clang::FullSourceLoc fullSL(launchStart, *SM); + insertReplacement(Rep, fullSL); + hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME}; + Statistics::current().incrementCounter(counter, refName.str()); + + return true; +} + +bool HipifyAction::cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result) { + const clang::MemberExpr* threadIdx = Result.Nodes.getNodeAs("cudaBuiltin"); + if (!threadIdx) { + return false; + } + + const clang::OpaqueValueExpr* refBase = clang::dyn_cast(threadIdx->getBase()); + if (!refBase) { + return false; + } + + const clang::DeclRefExpr* declRef = clang::dyn_cast(refBase->getSourceExpr()); + if (!declRef) { + return false; + } + + clang::SourceLocation sl = threadIdx->getLocStart(); + clang::SourceManager* SM = Result.SourceManager; + StringRef name = declRef->getDecl()->getName(); + StringRef memberName = threadIdx->getMemberDecl()->getName(); + size_t pos = memberName.find_first_not_of("__fetch_builtin_"); + memberName = memberName.slice(pos, memberName.size()); + clang::SmallString<128> tmpData; + name = clang::Twine(name + "." + memberName).toStringRef(tmpData); + + const auto found = CUDA_IDENTIFIER_MAP.find(name); + if (found != CUDA_IDENTIFIER_MAP.end()) { + Statistics::current().incrementCounter(found->second, name.str()); + if (!found->second.unsupported) { + StringRef repName = found->second.hipName; + ct::Replacement Rep(*SM, sl, name.size(), repName); + clang::FullSourceLoc fullSL(sl, *SM); + insertReplacement(Rep, fullSL); + } + } else { + clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics(); + const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unknown CUDA builtin"); + DE.Report(sl, ID); + } + + return true; +} + +bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) { + StringRef refName = "cudaSharedIncompleteArrayVar"; + auto* sharedVar = Result.Nodes.getNodeAs(refName); + if (!sharedVar) { + return false; + } + + // Example: extern __shared__ uint sRadix1[]; + if (!sharedVar->hasExternalFormalLinkage()) { + return false; + } + + clang::QualType QT = sharedVar->getType(); + std::string typeName; + if (QT->isIncompleteArrayType()) { + const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe(); + QT = AT->getElementType(); + if (QT.getTypePtr()->isBuiltinType()) { + QT = QT.getCanonicalType(); + const auto* BT = clang::dyn_cast(QT); + if (BT) { + clang::LangOptions LO; + LO.CUDA = true; + clang::PrintingPolicy policy(LO); + typeName = BT->getName(policy); + } + } else { + typeName = QT.getAsString(); + } + } + + if (!typeName.empty()) { + clang::SourceLocation slStart = sharedVar->getLocStart(); + clang::SourceLocation slEnd = sharedVar->getLocEnd(); + clang::SourceManager* SM = Result.SourceManager; + size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1; + std::string varName = sharedVar->getNameAsString(); + std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")"; + ct::Replacement Rep(*SM, slStart, repLength, repName); + clang::FullSourceLoc fullSL(slStart, *SM); + insertReplacement(Rep, fullSL); + hipCounter counter = {"HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME}; + Statistics::current().incrementCounter(counter, refName.str()); + } + + return true; +} + +void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) { + llcompat::insertReplacement(*replacements, rep); + if (PrintStats) { + rep.getLength(); + Statistics::current().lineTouched(fullSL.getExpansionLineNumber()); + Statistics::current().bytesChanged(rep.getLength()); + } +} + +std::unique_ptr HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) { + Finder.reset(new clang::ast_matchers::MatchFinder); + + // Replace the <<<...>>> language extension with a hip kernel launch + Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this); + + // Replace cuda builtins. + Finder->addMatcher( + mat::memberExpr( + mat::isExpansionInMainFile(), + mat::hasObjectExpression( + mat::hasType( + mat::cxxRecordDecl( + mat::matchesName("__cuda_builtin_") + ) + ) + ) + ).bind("cudaBuiltin"), + this + ); + + Finder->addMatcher( + mat::varDecl( + mat::isExpansionInMainFile(), + mat::allOf( + mat::hasAttr(clang::attr::CUDAShared), + mat::hasType(mat::incompleteArrayType()) + ) + ).bind("cudaSharedIncompleteArrayVar"), + this + ); + + // Ownership is transferred to the caller... + return Finder->newASTConsumer(); +} + +void HipifyAction::EndSourceFileAction() { + // Insert the hip header, if we didn't already do it by accident during substitution. + if (!insertedRuntimeHeader) { + // It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA headers are + // implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert + // one copy of the hip include into every file. + clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + + clang::SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID()); + clang::FullSourceLoc fullSL(sl, SM); + ct::Replacement Rep(SM, sl, 0, "#include \n"); + insertReplacement(Rep, fullSL); + } + + clang::ASTFrontendAction::EndSourceFileAction(); +} + + +namespace { + +/** + * A silly little class to proxy PPCallbacks back to the HipifyAction class. + */ +class PPCallbackProxy : public clang::PPCallbacks { + HipifyAction& hipifyAction; + +public: + explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {} + + void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token, + StringRef file_name, bool is_angled, clang::CharSourceRange filename_range, + const clang::FileEntry* file, StringRef search_path, StringRef relative_path, + const clang::Module* imported) override { + hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported); + } +}; + +} + +void HipifyAction::ExecuteAction() { + clang::Preprocessor& PP = getCompilerInstance().getPreprocessor(); + clang::SourceManager& SM = getCompilerInstance().getSourceManager(); + + // Start lexing the specified input file. + const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID()); + clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts()); + RawLex.SetKeepWhitespaceMode(true); + + // Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough + // information to tell the difference between identifiers, string literals, and "other stuff". It also + // ignores preprocessor directives, so this transformation will operate inside preprocessor-deleted + // code. + clang::Token RawTok; + RawLex.LexFromRawLexer(RawTok); + while (RawTok.isNot(clang::tok::eof)) { + RewriteToken(RawTok); + RawLex.LexFromRawLexer(RawTok); + } + + // Register yourself as the preprocessor callback, by proxy. + PP.addPPCallbacks(std::unique_ptr(new PPCallbackProxy(*this))); + + // Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching. + clang::ASTFrontendAction::ExecuteAction(); +} + +void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) { + if (cudaBuiltin(Result)) return; + if (cudaLaunchKernel(Result)) return; + if (cudaSharedIncompleteArrayVar(Result)) return; +} diff --git a/hipamd/hipify-clang/src/HipifyAction.h b/hipamd/hipify-clang/src/HipifyAction.h new file mode 100644 index 0000000000..03d34601f3 --- /dev/null +++ b/hipamd/hipify-clang/src/HipifyAction.h @@ -0,0 +1,83 @@ +#pragma once + +#include "clang/Lex/PPCallbacks.h" +#include "clang/Tooling/Tooling.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Tooling/Core/Replacement.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "ReplacementsFrontendActionFactory.h" + +namespace ct = clang::tooling; + +/** + * A FrontendAction that hipifies CUDA programs. + */ +class HipifyAction : public clang::ASTFrontendAction, + public clang::ast_matchers::MatchFinder::MatchCallback { +private: + ct::Replacements* replacements; + std::unique_ptr Finder; + + /// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent + // ones, and track - using this flag - if the result led to us including the hip runtime header. If it did + // not, we insert it at the top of the file when we finish processing it. + // This approach means we do the best it's possible to do w.r.t preserving the user's include order. + bool insertedRuntimeHeader = false; + + /** + * Rewrite a string literal to refer to hip, not CUDA. + */ + void RewriteString(StringRef s, clang::SourceLocation start); + + /** + * Replace a CUDA identifier with the corresponding hip identifier, if applicable. + */ + void RewriteToken(const clang::Token &t); + +public: + explicit HipifyAction(ct::Replacements *replacements): + clang::ASTFrontendAction(), + replacements(replacements) {} + + // MatchCallback listeners + bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result); + bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result); + bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result); + + /** + * Called by the preprocessor for each include directive during the non-raw lexing pass. + */ + void InclusionDirective(clang::SourceLocation hash_loc, + const clang::Token &include_token, + StringRef file_name, + bool is_angled, + clang::CharSourceRange filename_range, + const clang::FileEntry *file, + StringRef search_path, + StringRef relative_path, + const clang::Module *imported); + +protected: + /** + * Add a Replacement for the current file. These will all be applied after executing the FrontendAction. + */ + void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL); + + /** + * FrontendAction entry point. + */ + void ExecuteAction() override; + + /** + * Called at the start of each new file to process. + */ + void EndSourceFileAction() override; + + /** + * MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an + * interest for. + */ + void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override; + + std::unique_ptr CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override; +}; diff --git a/hipamd/hipify-clang/src/ReplacementsFrontendActionFactory.h b/hipamd/hipify-clang/src/ReplacementsFrontendActionFactory.h new file mode 100644 index 0000000000..7896635ef6 --- /dev/null +++ b/hipamd/hipify-clang/src/ReplacementsFrontendActionFactory.h @@ -0,0 +1,28 @@ +#pragma once + +#include "clang/Tooling/Tooling.h" +#include "clang/Frontend/FrontendAction.h" +#include "clang/Tooling/Core/Replacement.h" + +namespace ct = clang::tooling; + + +/** + * A FrontendActionFactory that propagates a set of Replacements into the FrontendAction. + * This is necessary boilerplate for using a custom FrontendAction with a RefactoringTool. + * + * @tparam T The FrontendAction to create. + */ +template +class ReplacementsFrontendActionFactory : public ct::FrontendActionFactory { + ct::Replacements* replacements; + +public: + explicit ReplacementsFrontendActionFactory(ct::Replacements* r): + ct::FrontendActionFactory(), + replacements(r) {} + + clang::FrontendAction* create() override { + return new T(replacements); + } +}; diff --git a/hipamd/hipify-clang/src/main.cpp b/hipamd/hipify-clang/src/main.cpp new file mode 100644 index 0000000000..0cc3594466 --- /dev/null +++ b/hipamd/hipify-clang/src/main.cpp @@ -0,0 +1,155 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ +/** + * @file Cuda2Hip.cpp + * + * This file is compiled and linked into clang based hipify tool. + */ +#include +#include +#include +#include +#include +#include +#include + +#include "CUDA2HipMap.h" +#include "LLVMCompat.h" +#include "HipifyAction.h" +#include "ArgParse.h" + +#define DEBUG_TYPE "cuda2hip" + +namespace ct = clang::tooling; + + +namespace { + +void copyFile(const std::string& src, const std::string& dst) { + std::ifstream source(src, std::ios::binary); + std::ofstream dest(dst, std::ios::binary); + dest << source.rdbuf(); +} + +} // anonymous namespace + +int main(int argc, const char **argv) { + llcompat::PrintStackTraceOnErrorSignal(); + + ct::CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore); + std::vector fileSources = OptionsParser.getSourcePathList(); + std::string dst = OutputFilename; + if (!dst.empty() && fileSources.size() > 1) { + llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n"; + return 1; + } + + if (NoOutput) { + if (Inplace) { + llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n"; + return 1; + } + if (!dst.empty()) { + llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n"; + return 1; + } + } + + if (Examine) { + NoOutput = PrintStats = true; + } + + int Result = 0; + + // Arguments for the Statistics print routines. + std::unique_ptr csv = nullptr; + llvm::raw_ostream* statPrint = nullptr; + if (!OutputStatsFilename.empty()) { + csv = std::unique_ptr(new std::ofstream(OutputStatsFilename, std::ios_base::trunc)); + } + if (PrintStats) { + statPrint = &llvm::errs(); + } + + for (const auto & src : fileSources) { + if (dst.empty()) { + if (Inplace) { + dst = src; + } else { + dst = src + ".hip"; + } + } else if (Inplace) { + llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n"; + return 1; + } + + std::string tmpFile = src + ".hipify-tmp"; + + // Create a copy of the file to work on. When we're done, we'll move this onto the + // output (which may mean overwriting the input, if we're in-place). + // Should we fail for some reason, we'll just leak this file and not corrupt the input. + copyFile(src, tmpFile); + + // Initialise the statistics counters for this file. + Statistics::setActive(src); + + // RefactoringTool operates on the file in-place. Giving it the output path is no good, + // because that'll break relative includes, and we don't want to overwrite the input file. + // So what we do is operate on a copy, which we then move to the output. + ct::RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile); + ct::Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile); + + ReplacementsFrontendActionFactory actionFactory(&replacementsToUse); + + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN)); + + // Ensure at least c++11 is used. + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-std=c++11", ct::ArgumentInsertPosition::BEGIN)); +#if defined(HIPIFY_CLANG_RES) + Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES)); +#endif + Tool.appendArgumentsAdjuster(ct::getClangSyntaxOnlyAdjuster()); + + // Hipify _all_ the things! + if (Tool.runAndSave(&actionFactory)) { + DEBUG(llvm::dbgs() << "Skipped some replacements.\n"); + } + + // Either move the tmpfile to the output, or remove it. + if (!NoOutput) { + rename(tmpFile.c_str(), dst.c_str()); + } else { + remove(tmpFile.c_str()); + } + + Statistics::current().markCompletion(); + Statistics::current().print(csv.get(), statPrint); + + dst.clear(); + } + + if (fileSources.size() > 1) { + Statistics::printAggregate(csv.get(), statPrint); + } + + return Result; +} diff --git a/hipamd/include/hip/hcc_detail/hip_runtime.h b/hipamd/include/hip/hcc_detail/hip_runtime.h index 379fc05f5b..924e774af0 100644 --- a/hipamd/include/hip/hcc_detail/hip_runtime.h +++ b/hipamd/include/hip/hcc_detail/hip_runtime.h @@ -50,10 +50,16 @@ THE SOFTWARE. #include +// define HIP_ENABLE_PRINTF to enable printf +#ifdef HIP_ENABLE_PRINTF + #define HCC_ENABLE_ACCELERATOR_PRINTF 1 +#endif + //--- // Remainder of this file only compiles with HCC #if defined __HCC__ #include +#include "hc_printf.hpp" //TODO-HCC-GL - change this to typedef. //typedef grid_launch_parm hipLaunchParm ; @@ -108,13 +114,12 @@ extern int HIP_TRACE_API; #if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0) // Device compile and not host compile: -//TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these. // 32-bit Atomics: #define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1) #define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1) #define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1) #define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1) -#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0) +#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1) // 64-bit Atomics: #define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1) @@ -376,6 +381,27 @@ __device__ void __threadfence_system(void) ; * @} */ +template::type f> +class Coordinates { + using R = decltype(f(0)); + + struct X { __device__ operator R() const { return f(0); } }; + struct Y { __device__ operator R() const { return f(1); } }; + struct Z { __device__ operator R() const { return f(2); } }; +public: + static constexpr X x{}; + static constexpr Y y{}; + static constexpr Z z{}; +}; + +static constexpr Coordinates blockDim; +static constexpr Coordinates blockIdx; +static constexpr Coordinates gridDim; +static constexpr Coordinates threadIdx; #define hipThreadIdx_x (hc_get_workitem_id(0)) #define hipThreadIdx_y (hc_get_workitem_id(1)) @@ -420,6 +446,20 @@ static inline __device__ void* memset(void* ptr, int val, size_t size) } +#ifdef __HCC_ACCELERATOR__ + +#ifdef HC_FEATURE_PRINTF +template +static inline __device__ void printf(const char* format, All... all) { + hc::printf(format, all...); +} +#else +template +static inline __device__ void printf(const char* format, All... all) { } +#endif + +#endif + #define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE) @@ -462,7 +502,7 @@ do {\ type* var = \ (type*)__get_dynamicgroupbaseptr(); \ -#define HIP_DYNAMIC_SHARED_ATTRIBUTE +#define HIP_DYNAMIC_SHARED_ATTRIBUTE diff --git a/hipamd/packaging/hip_base.txt b/hipamd/packaging/hip_base.txt index 836a82657b..f77e9ba3e8 100644 --- a/hipamd/packaging/hip_base.txt +++ b/hipamd/packaging/hip_base.txt @@ -28,7 +28,7 @@ set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") diff --git a/hipamd/packaging/hip_doc.txt b/hipamd/packaging/hip_doc.txt index 6f602c84cf..daef7810b4 100644 --- a/hipamd/packaging/hip_doc.txt +++ b/hipamd/packaging/hip_doc.txt @@ -33,7 +33,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/hipamd/packaging/hip_hcc.txt b/hipamd/packaging/hip_hcc.txt index 284d97e2e5..7c1736eee5 100644 --- a/hipamd/packaging/hip_hcc.txt +++ b/hipamd/packaging/hip_hcc.txt @@ -37,14 +37,14 @@ else() set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@)") endif() set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") if(@COMPILE_HIP_ATP_MARKER@) - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, libstdc++-static") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler") else() - set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, libstdc++-static") + set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@") endif() set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") set(CPACK_SOURCE_GENERATOR "TGZ") diff --git a/hipamd/packaging/hip_nvcc.txt b/hipamd/packaging/hip_nvcc.txt index 0d7c357623..254b7a956a 100644 --- a/hipamd/packaging/hip_nvcc.txt +++ b/hipamd/packaging/hip_nvcc.txt @@ -20,7 +20,7 @@ set(CPACK_BINARY_DEB "ON") #set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") #set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst") #set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") diff --git a/hipamd/packaging/hip_samples.txt b/hipamd/packaging/hip_samples.txt index 6d34a6fd40..c1707e42f1 100644 --- a/hipamd/packaging/hip_samples.txt +++ b/hipamd/packaging/hip_samples.txt @@ -21,7 +21,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM") set(CPACK_BINARY_DEB "ON") set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})") set(CPACK_BINARY_RPM "ON") -set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64") +set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}") set(CPACK_RPM_PACKAGE_AUTOREQPROV " no") set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt") diff --git a/hipamd/samples/0_Intro/module_api/Makefile b/hipamd/samples/0_Intro/module_api/Makefile index 38bd00a6a6..270d4c1211 100644 --- a/hipamd/samples/0_Intro/module_api/Makefile +++ b/hipamd/samples/0_Intro/module_api/Makefile @@ -5,7 +5,7 @@ endif HIPCC=$(HIP_PATH)/bin/hipcc HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler) -all: vcpy_kernel.code runKernel.hip.out defaultDriver.hip.out +all: vcpy_kernel.code runKernel.hip.out launchKernelHcc.hip.out runKernel.hip.out: runKernel.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ @@ -13,8 +13,8 @@ runKernel.hip.out: runKernel.cpp launchKernelHcc.hip.out: launchKernelHcc.cpp $(HIPCC) $(HIPCC_FLAGS) $< -o $@ -defaultDriver.hip.out: defaultDriver.cpp - $(HIPCC) $(HIPCC_FLAGS) $< -o $@ +#defaultDriver.hip.out: defaultDriver.cpp +# $(HIPCC) $(HIPCC_FLAGS) $< -o $@ vcpy_kernel.code: vcpy_kernel.cpp $(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@ diff --git a/hipamd/samples/0_Intro/module_api/launchKernelHcc.cpp b/hipamd/samples/0_Intro/module_api/launchKernelHcc.cpp index e86e44cb24..7c90198b6a 100644 --- a/hipamd/samples/0_Intro/module_api/launchKernelHcc.cpp +++ b/hipamd/samples/0_Intro/module_api/launchKernelHcc.cpp @@ -72,6 +72,7 @@ int main(){ uint32_t one = 1; struct { + uint32_t _hidden[6]; void * _Ad; void * _Bd; } args; diff --git a/hipamd/samples/0_Intro/module_api/runKernel.cpp b/hipamd/samples/0_Intro/module_api/runKernel.cpp index 2759c02140..fb34f80b7b 100644 --- a/hipamd/samples/0_Intro/module_api/runKernel.cpp +++ b/hipamd/samples/0_Intro/module_api/runKernel.cpp @@ -68,6 +68,7 @@ int main(){ uint32_t one = 1; struct { + uint32_t _hidden[6]; void * _Ad; void * _Bd; } args; diff --git a/hipamd/samples/0_Intro/module_api/test.cl b/hipamd/samples/0_Intro/module_api/test.cl deleted file mode 100644 index 81b20cab0e..0000000000 --- a/hipamd/samples/0_Intro/module_api/test.cl +++ /dev/null @@ -1,12 +0,0 @@ -__kernel void memset(char in, __global int* out) { -int tx = get_global_id(0); -out[tx] = in; -} - - -__kernel void vadd(__global float *Ad, __global float *Bd, __global float *Cd, int N){ -int tx = get_global_id(0); -if(tx < N){ -Cd[tx] = Ad[tx] + Bd[tx]; -} -} diff --git a/hipamd/samples/0_Intro/module_api/test.co b/hipamd/samples/0_Intro/module_api/test.co deleted file mode 100755 index a3e6b991e3..0000000000 Binary files a/hipamd/samples/0_Intro/module_api/test.co and /dev/null differ diff --git a/hipamd/src/device_functions.cpp b/hipamd/src/device_functions.cpp index 615ae4d0b7..3c9bf334fa 100644 --- a/hipamd/src/device_functions.cpp +++ b/hipamd/src/device_functions.cpp @@ -23,6 +23,11 @@ THE SOFTWARE. #include #include "device_util.h" +extern "C" float __ocml_floor_f32(float); +extern "C" float __ocml_rint_f32(float); +extern "C" float __ocml_ceil_f32(float); +extern "C" float __ocml_trunc_f32(float); + struct holder64Bit{ union{ double d; @@ -151,19 +156,19 @@ __device__ long long int __double_as_longlong(double x) __device__ int __float2int_rd(float x) { - return (int)x; + return (int)__ocml_floor_f32(x); } __device__ int __float2int_rn(float x) { - return (int)x; + return (int)__ocml_rint_f32(x); } __device__ int __float2int_ru(float x) { - return (int)x; + return (int)__ocml_ceil_f32(x); } __device__ int __float2int_rz(float x) { - return (int)x; + return (int)__ocml_trunc_f32(x); } __device__ long long int __float2ll_rd(float x) diff --git a/hipamd/src/device_util.cpp b/hipamd/src/device_util.cpp index 6afc797ec6..367a4c1a4f 100644 --- a/hipamd/src/device_util.cpp +++ b/hipamd/src/device_util.cpp @@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size) { return (void*)nullptr; } - uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z; - uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x; + uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z; + uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x; uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads; uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads; @@ -932,7 +932,7 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address, template __device__ T atomicCAS_impl(T* address, T compare, T val) { - // the implementation assumes the atomic is lock-free and + // the implementation assumes the atomic is lock-free and // has the same size as the non-atmoic equivalent type static_assert(sizeof(T) == sizeof(std::atomic) , "size mismatch between atomic and non-atomic types"); @@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val) T expected = compare; - // hcc should generate a system scope atomic CAS + // hcc should generate a system scope atomic CAS std::atomic_compare_exchange_weak_explicit(u.atomic_address , &expected, val , std::memory_order_acq_rel @@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() { return hc::get_dynamic_group_segment_base_pointer(); } -__host__ void* __get_dynamicgroupbaseptr() { - return nullptr; +__host__ void* __get_dynamicgroupbaseptr() { + return nullptr; } // Precise Math Functions diff --git a/hipamd/src/grid_launch.cpp b/hipamd/src/grid_launch.cpp index fd5c2a1573..9b3cf509c5 100644 --- a/hipamd/src/grid_launch.cpp +++ b/hipamd/src/grid_launch.cpp @@ -92,5 +92,8 @@ namespace hip_impl delete static_cast(locked_stream); locked_stream = nullptr; + if(HIP_PROFILE_API) { + MARKER_END(); + } } } diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 69d75e7f31..d3d0691e55 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -269,7 +269,7 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig ) hipError_t hipCtxSynchronize ( void ) { HIP_INIT_API(1); - return ihipSynchronize(); //TODP Shall check validity of ctx? + return ihipLogStatus(ihipSynchronize()); //TODP Shall check validity of ctx? } hipError_t hipCtxGetFlags ( unsigned int* flags ) diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index ff511b5509..521b56b0e9 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -60,12 +60,12 @@ hipError_t ihipGetDeviceCount(int *count) *count = g_deviceCnt; if (*count > 0) { - e = ihipLogStatus(hipSuccess); + e = hipSuccess; } else { - e = ihipLogStatus(hipErrorNoDevice); + e = hipErrorNoDevice; } } else { - e = ihipLogStatus(hipErrorInvalidValue); + e = hipErrorInvalidValue; } return e; } @@ -73,7 +73,7 @@ hipError_t ihipGetDeviceCount(int *count) hipError_t hipGetDeviceCount(int *count) { HIP_INIT_API(count); - return ihipGetDeviceCount(count); + return ihipLogStatus(ihipGetDeviceCount(count)); } hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig) @@ -205,7 +205,7 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device hipError_t e = hipSuccess; if(pi == nullptr) { - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } auto * hipDevice = ihipGetDevice(device); diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp index d1ee37a45e..3664e88d2b 100644 --- a/hipamd/src/hip_event.cpp +++ b/hipamd/src/hip_event.cpp @@ -31,12 +31,9 @@ THE SOFTWARE. ihipEvent_t::ihipEvent_t(unsigned flags) + : _criticalData(this) { - _state = hipEventStatusCreated; - _stream = NULL; _flags = flags; - _timestamp = 0; - _type = hipEventTypeIndependent; }; @@ -45,56 +42,45 @@ ihipEvent_t::ihipEvent_t(unsigned flags) void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType) { - _state = hipEventStatusRecording; - _marker = *cf; - _type = eventType; - _stream = stream; + LockedAccessor_EventCrit_t crit(_criticalData); + crit->_eventData.marker(*cf); + crit->_eventData._type = eventType; + crit->_eventData._stream = stream; + crit->_eventData._state = hipEventStatusRecording; } -void ihipEvent_t::refereshEventStatus() +std::pair +ihipEvent_t::refreshEventStatus() { - bool isReady0 = locked_isReady(); - bool isReady1; - int val = 0; - if (_state == hipEventStatusRecording) { - // TODO - use completion-future functions to obtain ticks and timestamps: - hsa_signal_t *sig = static_cast (_marker.get_native_handle()); - isReady1 = locked_isReady(); - if (sig) { - val = hsa_signal_load_acquire(*sig); - if (val == 0) { + auto ecd = locked_copyCrit(); + if (ecd._state == hipEventStatusRecording) { + bool isReady1 = ecd._stream->locked_eventIsReady(this); + if (isReady1) { + LockedAccessor_EventCrit_t eCrit(_criticalData); - if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) { - _timestamp = _marker.get_end_tick(); - } else if (_type == hipEventTypeStartCommand) { - _timestamp = _marker.get_begin_tick(); - } else { - assert(0); // TODO - move to debug assert - _timestamp = 0; - } - - _state = hipEventStatusComplete; + if ((eCrit->_eventData._type == hipEventTypeIndependent) || + (eCrit->_eventData._type == hipEventTypeStopCommand)) { + eCrit->_eventData._timestamp = eCrit->_eventData.marker().get_end_tick(); + } else if (eCrit->_eventData._type == hipEventTypeStartCommand) { + eCrit->_eventData._timestamp = eCrit->_eventData.marker().get_begin_tick(); + } else { + eCrit->_eventData._timestamp = 0; + assert(0); // TODO - move to debug assert } + + eCrit->_eventData._state = hipEventStatusComplete; + + return std::pair (eCrit->_eventData._state, eCrit->_eventData._timestamp); } - } + } - if (_state != hipEventStatusComplete) { - //printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1); - } + // Not complete path here: + return std::pair (ecd._state, ecd._timestamp); } -bool ihipEvent_t::locked_isReady() -{ - return _stream->locked_eventIsReady(this); -} - -void ihipEvent_t::locked_waitComplete(hc::hcWaitMode waitMode) -{ - return _stream->locked_eventWaitComplete(this, waitMode); -} hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags) @@ -136,33 +122,43 @@ hipError_t hipEventCreate(hipEvent_t* event) return ihipLogStatus(ihipEventCreate(event, 0)); } + hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) { HIP_INIT_SPECIAL_API(TRACE_SYNC, event, stream); - if (event && event->_state != hipEventStatusUnitialized) { + auto ecd = event->locked_copyCrit(); + + if (event && ecd._state != hipEventStatusUnitialized) { stream = ihipSyncAndResolveStream(stream); - event->_stream = stream; - if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) { - // TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0 - + // // If default stream , then wait on all queues. ihipCtx_t *ctx = ihipGetTlsDefaultCtx(); ctx->locked_syncDefaultStream(true, true); - event->_timestamp = hc::get_system_ticks(); - event->_state = hipEventStatusComplete; + { + LockedAccessor_EventCrit_t eCrit(event->criticalData()); + eCrit->_eventData.marker(hc::completion_future()); // reset event + eCrit->_eventData._stream = stream; + eCrit->_eventData._timestamp = hc::get_system_ticks(); + eCrit->_eventData._state = hipEventStatusComplete; + } return ihipLogStatus(hipSuccess); } else { - event->_state = hipEventStatusRecording; - // Clear timestamps - event->_timestamp = 0; - // Record the event in the stream: - stream->locked_recordEvent(event); + // Keep a copy outside the critical section so we lock stream first, then event - to avoid deadlock + hc::completion_future cf = stream->locked_recordEvent(event); + + { + LockedAccessor_EventCrit_t eCrit(event->criticalData()); + eCrit->_eventData.marker(cf); + eCrit->_eventData._stream = stream; + eCrit->_eventData._timestamp = 0; + eCrit->_eventData._state = hipEventStatusRecording; + } return ihipLogStatus(hipSuccess); } @@ -171,15 +167,13 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream) } } + hipError_t hipEventDestroy(hipEvent_t event) { HIP_INIT_API(event); if (event) { - event->_state = hipEventStatusUnitialized; - delete event; - event = NULL; return ihipLogStatus(hipSuccess); } else { @@ -191,19 +185,27 @@ hipError_t hipEventSynchronize(hipEvent_t event) { HIP_INIT_SPECIAL_API(TRACE_SYNC, event); + if (!(event->_flags & hipEventReleaseToSystem)) { + tprintf(DB_WARN, "hipEventSynchronize on event without system-scope fence ; consider creating with hipEventReleaseToSystem\n"); + } + auto ecd = event->locked_copyCrit(); + if (event) { - if (event->_state == hipEventStatusUnitialized) { + if (ecd._state == hipEventStatusUnitialized) { return ihipLogStatus(hipErrorInvalidResourceHandle); - } else if (event->_state == hipEventStatusCreated ) { + } else if (ecd._state == hipEventStatusCreated ) { // Created but not actually recorded on any device: return ihipLogStatus(hipSuccess); - } else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) { + } else if (HIP_SYNC_NULL_STREAM && (ecd._stream->isDefaultStream() )) { auto *ctx = ihipGetTlsDefaultCtx(); // TODO-HIP_SYNC_NULL_STREAM - can remove this code ctx->locked_syncDefaultStream(true, true); return ihipLogStatus(hipSuccess); } else { - event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); + ecd._stream->locked_eventWaitComplete( + ecd.marker(), + (event->_flags & hipEventBlockingSync) ? + hc::hcWaitModeBlocked : hc::hcWaitModeActive); return ihipLogStatus(hipSuccess); } @@ -220,44 +222,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop) *ms = 0.0f; - if ((start == nullptr) || - (start->_flags & hipEventDisableTiming) || - (start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) || - (stop == nullptr) || - (stop->_flags & hipEventDisableTiming) || - ( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) { - - // Both events must be at least recorded else return hipErrorInvalidResourceHandle - + if ((start == nullptr) || (stop == nullptr)) { status = hipErrorInvalidResourceHandle; - } else { - // Refresh status, if still recording... - start->refereshEventStatus(); - stop->refereshEventStatus(); + + auto startEcd = start->locked_copyCrit(); + auto stopEcd = stop->locked_copyCrit(); - if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) { - // Common case, we have good information for both events. + if ((start->_flags & hipEventDisableTiming) || + (startEcd._state == hipEventStatusUnitialized) || (startEcd._state == hipEventStatusCreated) || + (stop->_flags & hipEventDisableTiming) || + (stopEcd._state == hipEventStatusUnitialized) || (stopEcd._state == hipEventStatusCreated)) { - int64_t tickDiff = (stop->timestamp() - start->timestamp()); + // Both events must be at least recorded else return hipErrorInvalidResourceHandle + + status = hipErrorInvalidResourceHandle; - uint64_t freqHz; - hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); - if (freqHz) { - *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; - status = hipSuccess; } else { - * ms = 0.0f; - status = hipErrorInvalidValue; - } + // Refresh status, if still recording... + + auto startStatus = start->refreshEventStatus(); // pair < state, timestamp > + auto stopStatus = stop->refreshEventStatus(); // pair < state, timestamp > + + if ((startStatus.first == hipEventStatusComplete) && (stopStatus.first == hipEventStatusComplete)) { + // Common case, we have good information for both events. 'second" is the timestamp: + int64_t tickDiff = (stopStatus.second - startStatus.second); + + uint64_t freqHz; + hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz); + if (freqHz) { + *ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f; + status = hipSuccess; + } else { + * ms = 0.0f; + status = hipErrorInvalidValue; + } - } else if ((start->_state == hipEventStatusRecording) || - (stop->_state == hipEventStatusRecording)) { + } else if ((startStatus.first == hipEventStatusRecording) || + (stopStatus.first == hipEventStatusRecording)) { - status = hipErrorNotReady; - } else { + status = hipErrorNotReady; + } else { assert(0); + } } } @@ -268,7 +276,13 @@ hipError_t hipEventQuery(hipEvent_t event) { HIP_INIT_SPECIAL_API(TRACE_QUERY, event); - if ((event->_state == hipEventStatusRecording) && !event->locked_isReady()) { + if (!(event->_flags & hipEventReleaseToSystem)) { + tprintf(DB_WARN, "hipEventQuery on event without system-scope fence ; consider creating with hipEventReleaseToSystem\n"); + } + + auto ecd = event->locked_copyCrit(); + + if ((ecd._state == hipEventStatusRecording) && !ecd._stream->locked_eventIsReady(event)) { return ihipLogStatus(hipErrorNotReady); } else { return ihipLogStatus(hipSuccess); diff --git a/hipamd/src/hip_hcc.cpp b/hipamd/src/hip_hcc.cpp index 2d67c31fe7..2eff57f18b 100644 --- a/hipamd/src/hip_hcc.cpp +++ b/hipamd/src/hip_hcc.cpp @@ -47,6 +47,9 @@ THE SOFTWARE. #include "trace_helper.h" #include "env.h" +//TODO - create a stream-based debug interface as an additional option for tprintf +#define DB_PEER_CTX 0 + //================================================================================================= //Global variables: @@ -156,7 +159,7 @@ thread_local TidInfo tls_tidInfo; //================================================================================================= // Top-level "free" functions: //================================================================================================= -void recordApiTrace(std::string *fullStr, const std::string &apiStr) +uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr) { auto apiSeqNum = tls_tidInfo.apiSeqNum(); auto tid = tls_tidInfo.tid(); @@ -178,10 +181,14 @@ void recordApiTrace(std::string *fullStr, const std::string &apiStr) *fullStr += " "; *fullStr += apiStr; + uint64_t apiStartTick = getTicks(); + if (COMPILE_HIP_DB && HIP_TRACE_API) { - fprintf (stderr, "%s<c_str(), API_COLOR_END); + fprintf (stderr, "%s<c_str(), apiStartTick, API_COLOR_END); } + + return apiStartTick; } @@ -332,12 +339,11 @@ void ihipStream_t::locked_wait() // Causes current stream to wait for specified event to complete: // Note this does not provide any kind of host serialization. -void ihipStream_t::locked_streamWaitEvent(hipEvent_t event) +void ihipStream_t::locked_streamWaitEvent(ihipEventData_t &ecd) { LockedAccessor_StreamCrit_t crit(_criticalData); - - crit->_av.create_blocking_marker(event->marker(), hc::accelerator_scope); + crit->_av.create_blocking_marker(ecd.marker(), hc::accelerator_scope); } @@ -345,24 +351,28 @@ void ihipStream_t::locked_streamWaitEvent(hipEvent_t event) // Note this does not provide any kind of host serialization. bool ihipStream_t::locked_eventIsReady(hipEvent_t event) { + // Event query that returns "Complete" may cause HCC to manipulate // internal queue state so lock the stream's queue here. - LockedAccessor_StreamCrit_t crit(_criticalData); + LockedAccessor_StreamCrit_t scrit(_criticalData); - return (event->marker().is_ready()); + LockedAccessor_EventCrit_t ecrit(event->criticalData()); + + return (ecrit->_eventData.marker().is_ready()); } -void ihipStream_t::locked_eventWaitComplete(hipEvent_t event, hc::hcWaitMode waitMode) +// Waiting on event can cause HCC to reclaim stream resources - so need to lock the stream. +void ihipStream_t::locked_eventWaitComplete(hc::completion_future &marker, hc::hcWaitMode waitMode) { LockedAccessor_StreamCrit_t crit(_criticalData); - event->marker().wait(waitMode); + marker.wait(waitMode); } // Create a marker in this stream. // Save state in the event so it can track the status of the event. -void ihipStream_t::locked_recordEvent(hipEvent_t event) +hc::completion_future ihipStream_t::locked_recordEvent(hipEvent_t event) { // Lock the stream to prevent simultaneous access LockedAccessor_StreamCrit_t crit(_criticalData); @@ -378,7 +388,7 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event) scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope; } - event->marker(crit->_av.create_marker(scopeFlag)); + return crit->_av.create_marker(scopeFlag); }; //============================================================================= @@ -459,7 +469,9 @@ void ihipCtxCriticalBase_t::recomputePeerAgents() template<> bool ihipCtxCriticalBase_t::isPeerWatcher(const ihipCtx_t *peer) { - auto match = std::find(_peers.begin(), _peers.end(), peer); + auto match = std::find_if(_peers.begin(), _peers.end(), + [=] (const ihipCtx_t *d) { return d->getDeviceNum() == peer->getDeviceNum(); }); + return (match != std::end(_peers)); } @@ -616,7 +628,7 @@ void ihipDevice_t::locked_reset() //FIXME - Calling am_memtracker_reset is really bad since it destroyed all buffers allocated by the HCC runtime as well //such as the printf buffer. Re-initialze the printf buffer as a workaround for now. -#if (__hcc_workweek__ >= 17423) +#ifdef HC_FEATURE_PRINTF Kalmar::getContext()->initPrintfBuffer(); #endif }; @@ -700,26 +712,25 @@ int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool) return access; } -hsa_status_t get_region_info(hsa_region_t region, void* data) +hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data) { hsa_status_t err; hipDeviceProp_t* p_prop = reinterpret_cast(data); uint32_t region_segment; - // Get region segment - err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, ®ion_segment); + // Get pool segment + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, ®ion_segment); ErrorCheck(err); switch(region_segment) { case HSA_REGION_SEGMENT_READONLY: - err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem)); break; - /* case HSA_REGION_SEGMENT_PRIVATE: - cout<<"PRIVATE"<totalConstMem)); break; case HSA_REGION_SEGMENT_GROUP: - err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->sharedMemPerBlock)); break; + err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &(p_prop->sharedMemPerBlock)); + break; default: break; } - return HSA_STATUS_SUCCESS; + return err; } @@ -750,11 +761,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) hipError_t e = hipSuccess; hsa_status_t err; - // Set some defaults in case we don't find the appropriate regions: - prop->totalGlobalMem = 0; - prop->totalConstMem = 0; - prop-> maxThreadsPerMultiProcessor = 0; - prop->regsPerBlock = 0; + memset(prop, 0, sizeof(hipDeviceProp_t)); if (_hsaAgent.handle == -1) { return hipErrorInvalidDevice; @@ -854,15 +861,18 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu; // Get memory properties - err = hsa_agent_iterate_regions(_hsaAgent, get_region_info, prop); + err = hsa_amd_agent_iterate_memory_pools(_hsaAgent, get_pool_info, prop); + if (err == HSA_STATUS_INFO_BREAK) { + err = HSA_STATUS_SUCCESS; + } DeviceErrorCheck(err); - // Get the size of the region we are using for Accelerator Memory allocations: + // Get the size of the pool we are using for Accelerator Memory allocations: hsa_region_t *am_region = static_cast(_acc.get_hsa_am_region()); err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem); DeviceErrorCheck(err); // maxSharedMemoryPerMultiProcessor should be as the same as group memory size. - // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size. + // Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group pool size. prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem; // Get Max memory clock frequency @@ -882,7 +892,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) prop->arch.hasGlobalFloatAtomicExch = 1; prop->arch.hasSharedInt32Atomics = 1; prop->arch.hasSharedFloatAtomicExch = 1; - prop->arch.hasFloatAtomicAdd = 0; + prop->arch.hasFloatAtomicAdd = 1; // supported with CAS loop, but is supported prop->arch.hasGlobalInt64Atomics = 1; prop->arch.hasSharedInt64Atomics = 1; prop->arch.hasDoubles = 1; @@ -890,7 +900,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop) prop->arch.hasWarpBallot = 1; prop->arch.hasWarpShuffle = 1; prop->arch.hasFunnelShift = 0; // TODO-hcc - prop->arch.hasThreadFenceSystem = 0; // TODO-hcc + prop->arch.hasThreadFenceSystem = 1; prop->arch.hasSyncThreadsExt = 0; // TODO-hcc prop->arch.hasSurfaceFuncs = 0; // TODO-hcc prop->arch.has3dGrid = 1; @@ -1582,7 +1592,9 @@ void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launc tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n"); stream->lockclose_postKernelCommand(kernelName, lp.av); - MARKER_END(); + if(HIP_PROFILE_API) { + MARKER_END(); + } } //================================================================================================= @@ -1677,6 +1689,9 @@ const char *ihipErrorString(hipError_t hip_error) // So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx. bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo) { + if (copyEngineCtx == nullptr) { + return false; + } // Make sure this is a device-to-device copy with all memory available to the requested copy engine // @@ -1684,11 +1699,18 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin if (dstPtrInfo->_sizeBytes == 0) { return false; } else { +#if USE_APP_PTR_FOR_CTX + ihipCtx_t *dstCtx = static_cast (dstPtrInfo->_appPtr); +#else ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId); +#endif if (copyEngineCtx != dstCtx) { // Only checks peer list if contexts are different LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData()); - //tprintf(DB_SYNC, "dstCrit lock succeeded\n"); +#if DB_PEER_CTX + std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx << " peerCnt=" + << ctxCrit->peerCnt() << "\n"; +#endif if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { return false; }; @@ -1696,16 +1718,22 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin } - // TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here: if (srcPtrInfo->_sizeBytes == 0) { return false; } else { +#if USE_APP_PTR_FOR_CTX + ihipCtx_t *srcCtx = static_cast (srcPtrInfo->_appPtr); +#else ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId); +#endif if (copyEngineCtx != srcCtx) { // Only checks peer list if contexts are different LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData()); - //tprintf(DB_SYNC, "srcCrit lock succeeded\n"); +#if DB_PEER_CTX + std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx << " peerCnt=" + << ctxCrit->peerCnt() << "\n"; +#endif if (!ctxCrit->isPeerWatcher(copyEngineCtx)) { return false; }; @@ -1805,7 +1833,7 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, } } else { *forceUnpinnedCopy = true; - tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", + tprintf (DB_COPY, "Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n", *copyDevice ? (*copyDevice)->getDeviceNum() : -1, *copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0); if (HIP_FAIL_SOC & 0x2) { @@ -1820,10 +1848,11 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind, void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo) { - tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n", + tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d allocSeqNum=%zu, appAllocationFlags=%x, appPtr=%p\n", tag, ptr, ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes, - ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged); + ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged, + ptrInfo._allocSeqNum, ptrInfo._appAllocationFlags, ptrInfo._appPtr); } @@ -1871,12 +1900,14 @@ void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeByte }; -bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) +bool getTailoredPtrInfo(const char *tag, hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes) { bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS); + printPointerInfo(DB_COPY, tag, ptr, *ptrInfo); if (tracked) { tailorPtrInfo(ptrInfo, ptr, sizeBytes); + printPointerInfo(DB_COPY, " mod", ptr, *ptrInfo); } return tracked; @@ -1906,8 +1937,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes, hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); #endif - bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); + bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); // Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here: @@ -2034,21 +2065,18 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0); hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0); #endif - bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes); - bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes); + tprintf (DB_COPY, "copyASync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes); + bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes); + bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes); hc::hcCommandKind hcCopyDir; ihipCtx_t *copyDevice; bool forceUnpinnedCopy; resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, ©Device, &forceUnpinnedCopy); - tprintf (DB_COPY, "copyASync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) src=%p(phys_dev:%d, isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n", + tprintf (DB_COPY, " copyDev:%d dir=%s forceUnpinnedCopy=%d\n", copyDevice ? copyDevice->getDeviceNum():-1, - dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem, - src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem, - sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); - printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo); - printPointerInfo(DB_COPY, " src", src, srcPtrInfo); + hcMemcpyStr(hcCopyDir), forceUnpinnedCopy); // "tracked" really indicates if the pointer's virtual address is available in the GPU address space. // If both pointers are not tracked, we need to fall back to a sync copy. diff --git a/hipamd/src/hip_hcc_internal.h b/hipamd/src/hip_hcc_internal.h index 4b7e533a4c..6b51f5c202 100644 --- a/hipamd/src/hip_hcc_internal.h +++ b/hipamd/src/hip_hcc_internal.h @@ -32,10 +32,19 @@ THE SOFTWARE. #include "env.h" -#if defined(__HCC__) && (__hcc_workweek__ < 16354) +#if (__hcc_workweek__ < 16354) #error("This version of HIP requires a newer version of HCC."); #endif +// Use the __appPtr field in the am memtracker to store the context. +// Requires a bug fix in HCC +#if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0) +#define USE_APP_PTR_FOR_CTX 1 +#endif + + + + #define USE_IPC 1 //--- @@ -128,6 +137,7 @@ extern std::vector g_dbStopTriggers; class ihipStream_t; class ihipDevice_t; class ihipCtx_t; +struct ihipEventData_t; // Color defs for debug messages: #define KNRM "\x1B[0m" @@ -143,10 +153,12 @@ extern const char *API_COLOR; extern const char *API_COLOR_END; -// If set, thread-safety is enforced on all stream functions. -// Stream functions will acquire a mutex before entering critical sections. -#define STREAM_THREAD_SAFE 1 +// If set, thread-safety is enforced on all event/stream/ctx/device functions. +// Can disable for performance or functional experiments - in this case +// the code uses a dummy "no-op" mutex. +#define EVENT_THREAD_SAFE 1 +#define STREAM_THREAD_SAFE 1 #define CTX_THREAD_SAFE 1 @@ -209,7 +221,8 @@ extern const char *API_COLOR_END; #define DB_SYNC 1 /* 0x02 - trace synchronization pieces */ #define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */ #define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */ -#define DB_MAX_FLAG 4 +#define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */ +#define DB_MAX_FLAG 5 // When adding a new debug flag, also add to the char name table below. // // @@ -226,6 +239,7 @@ static const DbName dbName [] = {KYEL, "sync"}, {KCYN, "mem"}, {KMAG, "copy"}, + {KRED, "warn"}, }; @@ -244,23 +258,28 @@ static const DbName dbName [] = #endif - +static inline uint64_t getTicks() +{ + return hc::get_system_ticks(); +} //--- -extern void recordApiTrace(std::string *fullStr, const std::string &apiStr); +extern uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr); #if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1) #define API_TRACE(forceTrace, ...)\ +uint64_t hipApiStartTick;\ {\ tls_tidInfo.incApiSeqNum();\ if (forceTrace || (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1<>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_tidInfo.tid(),tls_tidInfo.apiSeqNum(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\ + auto ticks = getTicks() - hipApiStartTick;\ + fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \ + (localHipStatus == 0) ? API_COLOR:KRED, tls_tidInfo.tid(),tls_tidInfo.apiSeqNum(), \ + __func__, localHipStatus, ihipErrorString(localHipStatus), ticks, API_COLOR_END);\ }\ if (HIP_PROFILE_API) { MARKER_END(); }\ localHipStatus;\ @@ -371,6 +393,12 @@ class FakeMutex void unlock() { } }; +#if EVENT_THREAD_SAFE +typedef std::mutex EventMutex; +#else +#warning "Stream thread-safe disabled" +typedef FakeMutex EventMutex; +#endif #if STREAM_THREAD_SAFE typedef std::mutex StreamMutex; @@ -521,11 +549,11 @@ public: hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); }; - void locked_streamWaitEvent(hipEvent_t event); - void locked_recordEvent(hipEvent_t event); + void locked_streamWaitEvent(ihipEventData_t & event); + hc::completion_future locked_recordEvent(hipEvent_t event); bool locked_eventIsReady(hipEvent_t event); - void locked_eventWaitComplete(hipEvent_t event, hc::hcWaitMode waitMode); + void locked_eventWaitComplete(hc::completion_future &marker, hc::hcWaitMode waitMode); ihipStreamCritical_t &criticalData() { return _criticalData; }; @@ -609,32 +637,76 @@ enum ihipEventType_t { hipEventTypeStopCommand, }; + +struct ihipEventData_t +{ + ihipEventData_t() { + _state = hipEventStatusCreated; + _stream = NULL; + _timestamp = 0; + _type = hipEventTypeIndependent; + }; + + void marker(const hc::completion_future & marker) { _marker = marker; }; + hc::completion_future & marker() { return _marker; } + uint64_t timestamp() const { return _timestamp; } ; + ihipEventType_t type() const { return _type; }; + + ihipEventType_t _type; + hipEventStatus_t _state; + hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual stream when recorded + uint64_t _timestamp; // store timestamp, may be set on host or by marker. +private: + hc::completion_future _marker; +}; + + +//============================================================================= +//class ihipEventCriticalBase_t +template +class ihipEventCriticalBase_t : LockedBase +{ +public: + ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) : + _parent(parentEvent) + {} + ~ihipEventCriticalBase_t() {}; + + // Keep data in structure so it can be easily copied into snapshots + // (used to reduce lock contention and preserve correct lock order) + ihipEventData_t _eventData; + +private: + const ihipEvent_t *_parent; + friend class LockedAccessor; +}; + +typedef ihipEventCriticalBase_t ihipEventCritical_t; + +typedef LockedAccessor LockedAccessor_EventCrit_t; + // internal hip event structure. class ihipEvent_t { public: ihipEvent_t(unsigned flags); void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType); - void refereshEventStatus(); - hc::completion_future & marker() { return _marker; } - void marker(hc::completion_future cf) { _marker = cf; }; + std::pair refreshEventStatus(); // returns pair - bool locked_isReady(); - void locked_waitComplete(hc::hcWaitMode waitMode); - uint64_t timestamp() const { return _timestamp; } ; - ihipEventType_t type() const { return _type; }; + // Return a copy of the critical state. The critical data is locked during the copy. + ihipEventData_t locked_copyCrit() { + LockedAccessor_EventCrit_t crit(_criticalData); + return _criticalData._eventData; + }; + + ihipEventCritical_t &criticalData() { return _criticalData; }; public: - hipEventStatus_t _state; - - hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual stream when recorded unsigned _flags; - private: - hc::completion_future _marker; - ihipEventType_t _type; - uint64_t _timestamp; // store timestamp, may be set on host or by marker. + ihipEventCritical_t _criticalData; + friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream); } ; @@ -652,7 +724,6 @@ public: }; ~ihipDeviceCriticalBase_t() { - } // Contexts: diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index a8324c5729..8a5225d499 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -61,11 +61,15 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags) auto device = ctx->getWriteableDevice(); +#if USE_APP_PTR_FOR_CTX + hc::am_memtracker_update(ptr, device->_deviceId, hipFlags, ctx); +#else hc::am_memtracker_update(ptr, device->_deviceId, hipFlags); +#endif if (shareWithAll) { hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr); - tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); + tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt); if (s != HSA_STATUS_SUCCESS) { ret = -1; } @@ -122,7 +126,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool if (HIP_INIT_ALLOC != -1) { // TODO , dont' call HIP API directly here: hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes); - } + } if (ptr != nullptr) { int r = sharePtr(ptr, ctx, shareWithAll, hipFlags); @@ -251,7 +255,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes) hip_status = hipErrorMemoryAllocation; } - } + } return ihipLogStatus(hip_status); @@ -284,10 +288,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) } - const unsigned supportedFlags = hipHostMallocPortable - | hipHostMallocMapped - | hipHostMallocWriteCombined - | hipHostMallocCoherent + const unsigned supportedFlags = hipHostMallocPortable + | hipHostMallocMapped + | hipHostMallocWriteCombined + | hipHostMallocCoherent | hipHostMallocNonCoherent; @@ -300,7 +304,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags) hip_status = hipErrorInvalidValue; } else { auto device = ctx->getWriteableDevice(); - + unsigned amFlags = 0; if (flags & hipHostMallocCoherent) { amFlags = amHostCoherent; @@ -581,7 +585,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array, hsa_ext_image_data_info_t imageInfo; hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo); size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment; - + *ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment); if (size && (*ptr == NULL)) { @@ -660,7 +664,11 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags) vecAcc.push_back(ihipGetDevice(i)->_acc); } am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size()); +#if USE_APP_PTR_FOR_CTX + hc::am_memtracker_update(hostPtr, device->_deviceId, flags, ctx); +#else hc::am_memtracker_update(hostPtr, device->_deviceId, flags); +#endif tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size()); if(am_status == AM_SUCCESS){ diff --git a/hipamd/src/hip_module.cpp b/hipamd/src/hip_module.cpp index dee69e7ba0..38411f2347 100644 --- a/hipamd/src/hip_module.cpp +++ b/hipamd/src/hip_module.cpp @@ -27,6 +27,7 @@ THE SOFTWARE. #include #include #include +#include #include #include @@ -217,7 +218,7 @@ namespace using namespace ELFIO; using namespace std; - static constexpr pair r{0, 0}; + static const pair r{0, 0}; for (auto i = 0u; i != section.get_symbols_num(); ++i) { // TODO: this is boyscout code, caching the temporaries @@ -374,6 +375,8 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname) hipError_t hipModuleUnload(hipModule_t hmod) { + HIP_INIT_API(hmod); + // TODO - improve this synchronization so it is thread-safe. // Currently we want for all inflight activity to complete, but don't prevent another // thread from launching new kernels before we finish this operation. @@ -408,7 +411,7 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char hipError_t ret = hipSuccess; if (name == nullptr){ - return ihipLogStatus(hipErrorInvalidValue); + return (hipErrorInvalidValue); } if (ctx == nullptr){ @@ -431,7 +434,7 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char hsa_executable_symbol_t symbol; status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &symbol); if(status != HSA_STATUS_SUCCESS){ - return ihipLogStatus(hipErrorNotFound); + return hipErrorNotFound; } status = hsa_executable_symbol_get_info(symbol, @@ -503,11 +506,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f, if(config[0] == HIP_LAUNCH_PARAM_BUFFER_POINTER && config[2] == HIP_LAUNCH_PARAM_BUFFER_SIZE && config[4] == HIP_LAUNCH_PARAM_END){ kernArgSize = *(size_t*)(config[3]); } else { - return ihipLogStatus(hipErrorNotInitialized); + return hipErrorNotInitialized; } }else{ - return ihipLogStatus(hipErrorInvalidValue); + return hipErrorInvalidValue; } @@ -611,6 +614,125 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f, sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent)); } +namespace +{ + struct Agent_global { + std::string name; + hipDeviceptr_t address; + std::uint32_t byte_cnt; + }; + + inline + void* address(hsa_executable_symbol_t x) + { + void* r = nullptr; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r); + + return r; + } + + inline + std::string name(hsa_executable_symbol_t x) + { + uint32_t sz = 0u; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz); + + std::string r(sz, '\0'); + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front()); + + return r; + } + + inline + std::uint32_t size(hsa_executable_symbol_t x) + { + std::uint32_t r = 0; + hsa_executable_symbol_get_info( + x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r); + + return r; + } + + inline + void track(const Agent_global& x) + { + tprintf( + DB_MEM, + " add variable '%s' with ptr=%p size=%u to tracker\n", + x.name.c_str(), + x.address, + x.byte_cnt); + + auto device = ihipGetTlsDefaultCtx()->getWriteableDevice(); + + hc::AmPointerInfo ptr_info( + nullptr, + x.address, + x.address, + x.byte_cnt, + device->_acc, + true, + false); + hc::am_memtracker_add(x.address, ptr_info); + hc::am_memtracker_update(x.address, device->_deviceId, 0u); + } + + template> + inline + hsa_status_t copy_agent_global_variables( + hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* out) + { + assert(out); + + hsa_symbol_kind_t t = {}; + hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t); + + if (t == HSA_SYMBOL_KIND_VARIABLE) { + static_cast(out)->push_back( + Agent_global{name(x), address(x), size(x)}); + + track(static_cast(out)->back()); + } + + return HSA_STATUS_SUCCESS; + } + + inline + hsa_agent_t this_agent() + { + auto ctx = ihipGetTlsDefaultCtx(); + + if (!ctx) throw std::runtime_error{"No active HIP context."}; + + auto device = ctx->getDevice(); + + if (!device) throw std::runtime_error{"No device available for HIP."}; + + ihipDevice_t *currentDevice = ihipGetDevice(device->_deviceId); + + if (!currentDevice) { + throw std::runtime_error{"No active device for HIP"}; + } + + return currentDevice->_hsaAgent; + } + + inline + std::vector read_agent_globals(hipModule_t hmodule) + { + std::vector r; + + + hsa_executable_iterate_agent_symbols( + hmodule->executable, this_agent(), copy_agent_global_variables, &r); + + return r; + } +} + hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, hipModule_t hmod, const char* name) { @@ -623,11 +745,37 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes, return ihipLogStatus(hipErrorNotInitialized); } else{ - hipFunction_t func; - ihipModuleGetSymbol(&func, hmod, name); - *bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t); - *dptr = reinterpret_cast(func->_object); - return ihipLogStatus(ret); + static std::unordered_map< + hipModule_t, std::vector> agent_globals; + + // TODO: this is not particularly robust. + if (agent_globals.count(hmod) == 0) { + static std::mutex mtx; + std::lock_guard lck{mtx}; + + if (agent_globals.count(hmod) == 0) { + agent_globals.emplace(hmod, read_agent_globals(hmod)); + } + } + + // TODO: This is unsafe iff some other emplacement triggers rehashing. + // It will have to be properly fleshed out in the future. + const auto it0 = agent_globals.find(hmod); + if (it0 == agent_globals.cend()) { + throw std::runtime_error{"agent_globals data structure corrupted."}; + } + + const auto it1 = std::find_if( + it0->second.cbegin(), + it0->second.cend(), + [=](const Agent_global& x) { return x.name == name; }); + + if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound); + + *dptr = it1->address; + *bytes = it1->byte_cnt; + + return ihipLogStatus(hipSuccess); } } diff --git a/hipamd/src/hip_stream.cpp b/hipamd/src/hip_stream.cpp index 7dd6efd39c..dab31dad62 100644 --- a/hipamd/src/hip_stream.cpp +++ b/hipamd/src/hip_stream.cpp @@ -93,18 +93,23 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int hipError_t e = hipSuccess; + auto ecd = event->locked_copyCrit(); + if (event == nullptr) { e = hipErrorInvalidResourceHandle; - } else if (event->_state != hipEventStatusUnitialized) { + } else if ((ecd._state != hipEventStatusUnitialized) && + (ecd._state != hipEventStatusCreated)) { if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) { // conservative wait on host for the specified event to complete: - event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); + // return _stream->locked_eventWaitComplete(this, waitMode); + // + ecd._stream->locked_eventWaitComplete(ecd.marker(), (event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive); } else { stream = ihipSyncAndResolveStream(stream); - // This will user create_blocking_marker to wait on the specified queue. - stream->locked_streamWaitEvent(event); + // This will use create_blocking_marker to wait on the specified queue. + stream->locked_streamWaitEvent(ecd); } } // else event not recorded, return immediately and don't create marker. @@ -140,7 +145,6 @@ hipError_t hipStreamQuery(hipStream_t stream) //--- hipError_t hipStreamSynchronize(hipStream_t stream) { - HIP_INIT_API(stream); HIP_INIT_SPECIAL_API(TRACE_SYNC, stream); hipError_t e = hipSuccess; diff --git a/hipamd/tests/README.md b/hipamd/tests/README.md index 27cde7c534..a9638ba95f 100644 --- a/hipamd/tests/README.md +++ b/hipamd/tests/README.md @@ -75,6 +75,18 @@ RUN: %t CMAKE_TEST_NAME EXCLUDE_HIP_PLATFORM +``` +cmake_command: refers to any of the commands supported by ```cmake -E``` as specified in the [cmake documentation](https://cmake.org/cmake/help/latest/manual/cmake.1.html#command-line-tool-mode). Note that the commands are limited by the version of cmake the user is running. +options_to_cmake_command: refers to the arguments supported by the specific cmake_command. The arguments are parsed by HIT to replace special markers. The markers supported by HIT are: +%S: Refers to the source directory containing the current source file. +%B: Refers to the build directory for the current cmake project i.e. CMAKE_CURRENT_BINARY_DIR. + + ### Running tests: ``` ctest diff --git a/hipamd/tests/hipify-clang/allocators.cu b/hipamd/tests/hipify-clang/allocators.cu new file mode 100644 index 0000000000..3f130be227 --- /dev/null +++ b/hipamd/tests/hipify-clang/allocators.cu @@ -0,0 +1,18 @@ +// RUN: %run_test hipify "%s" "%t" %cuda_args + +#pragma once + +#include + + +/** + * Allocate GPU memory for `count` elements of type `T`. + */ +template +static T* gpuMalloc(size_t count) { + T* ret = nullptr; + // CHECK: hipMalloc(&ret, count * sizeof(T)); + cudaMalloc(&ret, count * sizeof(T)); + return ret; +} + diff --git a/hipamd/tests/hipify-clang/axpy.cu b/hipamd/tests/hipify-clang/axpy.cu index 2fd62ac344..071c503a35 100644 --- a/hipamd/tests/hipify-clang/axpy.cu +++ b/hipamd/tests/hipify-clang/axpy.cu @@ -2,6 +2,8 @@ #include +// CHECK: #include +#include #define TOKEN_PASTE(X, Y) X ## Y #define ARG_LIST_AS_MACRO a, device_x, device_y @@ -33,8 +35,13 @@ int main(int argc, char* argv[]) { // CHECK: hipMalloc(&device_x, kDataLen * sizeof(float)); cudaMalloc(&device_x, kDataLen * sizeof(float)); +#ifdef HERRING // CHECK: hipMalloc(&device_y, kDataLen * sizeof(float)); cudaMalloc(&device_y, kDataLen * sizeof(float)); +#else + // CHECK: hipMalloc(&device_y, kDataLen * sizeof(double)); + cudaMalloc(&device_y, kDataLen * sizeof(double)); +#endif // CHECK: hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice); cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice); diff --git a/hipamd/tests/hipify-clang/concurentKernels.cu b/hipamd/tests/hipify-clang/concurentKernels.cu index e369baaf3e..27e9e0d0e7 100644 --- a/hipamd/tests/hipify-clang/concurentKernels.cu +++ b/hipamd/tests/hipify-clang/concurentKernels.cu @@ -99,6 +99,7 @@ int main(int argc, char **argv) // use command-line specified CUDA device, otherwise use device with highest Gflops/s cuda_device = findCudaDevice(argc, (const char **)argv); + // CHECK: hipDeviceProp_t deviceProp; cudaDeviceProp deviceProp; // CHECK: checkCudaErrors(hipGetDevice(&cuda_device)); checkCudaErrors(cudaGetDevice(&cuda_device)); @@ -135,6 +136,7 @@ int main(int argc, char **argv) checkCudaErrors(cudaStreamCreate(&(streams[i]))); } + // CHECK: hipEvent_t start_event, stop_event; // create CUDA event handles cudaEvent_t start_event, stop_event; diff --git a/hipamd/tests/hit/HIT.cmake b/hipamd/tests/hit/HIT.cmake index fd0001214e..82e8508dcd 100644 --- a/hipamd/tests/hit/HIT.cmake +++ b/hipamd/tests/hit/HIT.cmake @@ -155,6 +155,20 @@ macro(HIT_ADD_FILES _dir _label _parent) endif() endforeach() + # Run cmake commands + execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --cmakeCMDs ${file} + OUTPUT_VARIABLE _contents + ERROR_QUIET + WORKING_DIRECTORY ${_dir} + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REGEX REPLACE "\n" ";" _contents "${_contents}") + string(REGEX REPLACE "%S" ${_dir} _contents "${_contents}") + string(REGEX REPLACE "%B" ${CMAKE_CURRENT_BINARY_DIR} _contents "${_contents}") + foreach(_cmd ${_contents}) + string(REGEX REPLACE " " ";" _cmd "${_cmd}") + execute_process(COMMAND ${CMAKE_COMMAND} -E ${_cmd}) + endforeach() + # Add tests execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --runCMDs ${file} OUTPUT_VARIABLE _contents diff --git a/hipamd/tests/hit/parser b/hipamd/tests/hit/parser index 3d851752e4..f77bd524f3 100755 --- a/hipamd/tests/hit/parser +++ b/hipamd/tests/hit/parser @@ -8,7 +8,7 @@ use File::Spec; sub parse_file { my $file = shift; (my $exe = $file) =~ s/\.[^.]+$//g; - my (@buildCMDs, @runCMDs, @runNamedCMDs); + my (@buildCMDs, @runCMDs, @runNamedCMDs, @cmakeCMDs); if (open (SOURCE, '<:encoding(UTF-8)', "$file")) { while () { my $line=$_; @@ -36,10 +36,17 @@ sub parse_file { $line =~ s/\R//g; # Remove line endings push @runNamedCMDs, $line; } + # Look for CMAKECMD instructions + if ($line =~ /^ \* CMAKECMD:/) { + $line =~ s/^ \* CMAKECMD: //g; # Remove " * CMAKECMD: " + # Substitute %S -> srcdir and %B -> builddir happens in cmake + $line =~ s/\R//g; # Remove line endings + push @cmakeCMDs, $line; + } } close(SOURCE); } - return (\@buildCMDs, \@runCMDs, \@runNamedCMDs); + return (\@buildCMDs, \@runCMDs, \@runNamedCMDs, \@cmakeCMDs); } # Exit if no arguments specified @@ -53,8 +60,9 @@ my @options = (); my $retBuildCMDs = 0; my $retRunCMDs = 0; my $retRunNamedCMDs = 0; +my $retCmakeCMDs = 0; foreach $arg (@ARGV) { - if ($retBuildCMDs or $retRunCMDs or $retRunNamedCMDs) { + if ($retBuildCMDs or $retRunCMDs or $retRunNamedCMDs or $retCmakeCMDs) { push (@options, $arg); } if ($arg eq '--buildCMDs') { @@ -66,18 +74,21 @@ foreach $arg (@ARGV) { if ($arg eq '--runNamedCMDs') { $retRunNamedCMDs = 1; } + if ($arg eq '--cmakeCMDs') { + $retCmakeCMDs = 1; + } } # Atleast one command needs to be specified -if (($retBuildCMDs eq 0) and ($retRunCMDs eq 0) and ($retRunNamedCMDs eq 0)) { - die "Usage: $0 <--buildCMDs|--runCMDs|--runNamedCMDs> FILENAMEs\n"; +if (($retBuildCMDs eq 0) and ($retRunCMDs eq 0) and ($retRunNamedCMDs eq 0) and ($retCmakeCMDs eq 0)) { + die "Usage: $0 <--buildCMDs|--runCMDs|--runNamedCMDs|--cmakeCMDs> FILENAMEs\n"; } # Iterate over input files foreach $file (@options) { # Convert absolute path to path relative to working directory my $relfile = File::Spec->abs2rel($file); - my ($buildCMDs, $runCMDs, $runNamedCMDs) = parse_file("$relfile"); + my ($buildCMDs, $runCMDs, $runNamedCMDs, $cmakeCMDs) = parse_file("$relfile"); if ($retBuildCMDs) { # print "BuildCMDs:\n"; print "$_\n" for @$buildCMDs; @@ -90,6 +101,10 @@ foreach $file (@options) { # print "RunNamedCMDs:\n"; print "$_\n" for @$runNamedCMDs; } + if ($retCmakeCMDs) { + # print "CmakeCMDs:\n"; + print "$_\n" for @$cmakeCMDs; + } } # vim: ts=4:sw=4:expandtab:smartindent diff --git a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp index dac56bf709..c8c2e644c3 100644 --- a/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp +++ b/hipamd/tests/src/Functional/device/hipFuncDeviceSynchronize.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define NUM_STREAMS 2 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; // Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below if(tx == 0){ for(int i = 0; i #include -#include "hip/hip_runtime_api.h" +#include "hip/hip_runtime.h" +#include "test_common.h" #define LEN 64 #define HALF_SIZE 64*sizeof(__half) #define HALF2_SIZE 64*sizeof(__half2) -#if __HIP_ARCH_GFX803__ > 0 +#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; __half a = A[tx]; __half b = B[tx]; __half c = C[tx]; @@ -44,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) { } __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; __half2 a = A[tx]; __half2 b = B[tx]; __half2 c = C[tx]; @@ -61,15 +62,21 @@ __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C #endif int main(){ - __half *A, *B, *C; - hipMalloc(&A, HALF_SIZE); - hipMalloc(&B, HALF_SIZE); - hipMalloc(&C, HALF_SIZE); - hipLaunchKernel(__halfMath, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A, B, C); - __half2 *A2, *B2, *C2; - hipMalloc(&A, HALF2_SIZE); - hipMalloc(&B, HALF2_SIZE); - hipMalloc(&C, HALF2_SIZE); - hipLaunchKernel(__half2Math, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A2, B2, C2); - + __half *A, *B, *C; + hipMalloc(&A, HALF_SIZE); + hipMalloc(&B, HALF_SIZE); + hipMalloc(&C, HALF_SIZE); + hipLaunchKernel(__halfMath, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A, B, C); + hipFree(A); + hipFree(B); + hipFree(C); + __half2 *A2, *B2, *C2; + hipMalloc(&A2, HALF2_SIZE); + hipMalloc(&B2, HALF2_SIZE); + hipMalloc(&C2, HALF2_SIZE); + hipLaunchKernel(__half2Math, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A2, B2, C2); + hipFree(A2); + hipFree(B2); + hipFree(C2); + passed(); } diff --git a/hipamd/tests/src/deviceLib/hipThreadFence.cpp b/hipamd/tests/src/deviceLib/hipThreadFence.cpp index 1bd9fbe02c..2f73b68529 100644 --- a/hipamd/tests/src/deviceLib/hipThreadFence.cpp +++ b/hipamd/tests/src/deviceLib/hipThreadFence.cpp @@ -33,7 +33,7 @@ THE SOFTWARE. __global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; In4[tid] = In1[tid] + In2[tid]; __threadfence(); In3[tid] = In3[tid] + In4[tid]; diff --git a/hipamd/tests/src/deviceLib/hip_anyall.cpp b/hipamd/tests/src/deviceLib/hip_anyall.cpp index bba7915052..f0b314ce8d 100644 --- a/hipamd/tests/src/deviceLib/hip_anyall.cpp +++ b/hipamd/tests/src/deviceLib/hip_anyall.cpp @@ -37,9 +37,9 @@ __global__ void warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - device_any[hipThreadIdx_x>>pshift] = __any(tid -77); - device_all[hipThreadIdx_x>>pshift] = __all(tid -77); + int tid = threadIdx.x + blockIdx.x * blockDim.x; + device_any[threadIdx.x>>pshift] = __any(tid -77); + device_all[threadIdx.x>>pshift] = __all(tid -77); } int main(int argc, char *argv[]) @@ -49,7 +49,7 @@ int main(int argc, char *argv[]) warpSize = devProp.warpSize; int w = warpSize; - pshift = 0; + pshift = 0; while (w >>= 1) ++pshift; printf ("warpSize=%d pshift=%d\n", warpSize, pshift); diff --git a/hipamd/tests/src/deviceLib/hip_ballot.cpp b/hipamd/tests/src/deviceLib/hip_ballot.cpp index 742c47a065..14b8f314a1 100644 --- a/hipamd/tests/src/deviceLib/hip_ballot.cpp +++ b/hipamd/tests/src/deviceLib/hip_ballot.cpp @@ -34,12 +34,12 @@ __global__ void gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; - const unsigned int warp_num = hipThreadIdx_x >> pshift; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const unsigned int warp_num = threadIdx.x >> pshift; #ifdef __HIP_PLATFORM_HCC__ - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245))); #else - atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); + atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245))); #endif } diff --git a/hipamd/tests/src/deviceLib/hip_brev.cpp b/hipamd/tests/src/deviceLib/hip_brev.cpp index 855a8bec47..c08c39dec9 100644 --- a/hipamd/tests/src/deviceLib/hip_brev.cpp +++ b/hipamd/tests/src/deviceLib/hip_brev.cpp @@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_clz.cpp b/hipamd/tests/src/deviceLib/hip_clz.cpp index bdb31f3e8d..53fd611184 100644 --- a/hipamd/tests/src/deviceLib/hip_clz.cpp +++ b/hipamd/tests/src/deviceLib/hip_clz.cpp @@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_ffs.cpp b/hipamd/tests/src/deviceLib/hip_ffs.cpp index c855ede060..49530bb298 100644 --- a/hipamd/tests/src/deviceLib/hip_ffs.cpp +++ b/hipamd/tests/src/deviceLib/hip_ffs.cpp @@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_popc.cpp b/hipamd/tests/src/deviceLib/hip_popc.cpp index e503e55b42..19dafb4d43 100644 --- a/hipamd/tests/src/deviceLib/hip_popc.cpp +++ b/hipamd/tests/src/deviceLib/hip_popc.cpp @@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp, unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp index 171ff1afd0..63d50e881e 100644 --- a/hipamd/tests/src/deviceLib/hip_test_ldg.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_ldg.cpp @@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp, T* a, const T* bm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { diff --git a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp index ef493ac923..6eba236e12 100644 --- a/hipamd/tests/src/deviceLib/hip_test_make_type.cpp +++ b/hipamd/tests/src/deviceLib/hip_test_make_type.cpp @@ -40,13 +40,13 @@ THE SOFTWARE. #define THREADS_PER_BLOCK_Z 1 -__global__ void +__global__ void vectoradd_char1(hipLaunchParm lp, - char1* a, const char1* bm, const char1* cm, int width, int height) + char1* a, const char1* bm, const char1* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp, } } -__global__ void +__global__ void vectoradd_char2(hipLaunchParm lp, - char2* a, const char2* bm, const char2* cm, int width, int height) + char2* a, const char2* bm, const char2* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y); } -} +} -__global__ void +__global__ void vectoradd_char3(hipLaunchParm lp, - char3* a, const char3* bm, const char3* cm, int width, int height) + char3* a, const char3* bm, const char3* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z); } } -__global__ void +__global__ void vectoradd_char4(hipLaunchParm lp, - char4* a, const char4* bm, const char4* cm, int width, int height) + char4* a, const char4* bm, const char4* cm, int width, int height) { - int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x; - int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y; + int x = blockDim.x * blockIdx.x + threadIdx.x; + int y = blockDim.y * blockIdx.y + threadIdx.y; int i = y * width + x; if ( i < (width * height)) { @@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp, #if 0 __kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) { - + int x = blockDimX * blockIdx.x + threadIdx.x; int y = blockDimY * blockIdy.y + threadIdx.y; @@ -128,21 +128,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -192,21 +192,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -256,21 +256,21 @@ bool dataTypesRun(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -319,21 +319,21 @@ bool dataTypesRunChar4(){ hostA = (T*)malloc(NUM * sizeof(T)); hostB = (T*)malloc(NUM * sizeof(T)); hostC = (T*)malloc(NUM * sizeof(T)); - + // initialize the input data for (i = 0; i < NUM; i++) { hostB[i] = (T)i; hostC[i] = (T)i; } - + HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T))); HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T))); - + HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice)); HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice)); - hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), + hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1), dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y), dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y), 0, 0, @@ -368,7 +368,7 @@ bool dataTypesRunChar4(){ } int main() { - + hipDeviceProp_t devProp; hipGetDeviceProperties(&devProp, 0); cout << " System minor " << devProp.minor << endl; diff --git a/hipamd/tests/src/deviceLib/hip_trig.cpp b/hipamd/tests/src/deviceLib/hip_trig.cpp index 5ec28101f3..6ee8dc58ad 100644 --- a/hipamd/tests/src/deviceLib/hip_trig.cpp +++ b/hipamd/tests/src/deviceLib/hip_trig.cpp @@ -36,7 +36,7 @@ THE SOFTWARE. #define SIZE LEN<<2 __global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){ - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; sin_d[tid] = __sinf(In[tid]); cos_d[tid] = __cosf(In[tid]); tan_d[tid] = __tanf(In[tid]); diff --git a/hipamd/tests/src/experimental/xcompile/hHip.c b/hipamd/tests/src/experimental/xcompile/hHip.c index 2ac4ebc73e..17e7e9ecf6 100644 --- a/hipamd/tests/src/experimental/xcompile/hHip.c +++ b/hipamd/tests/src/experimental/xcompile/hHip.c @@ -29,7 +29,7 @@ THE SOFTWARE. __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp index 79a272aaf2..5dca6c1bca 100644 --- a/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp +++ b/hipamd/tests/src/experimental/xcompile/hipxxKer.cpp @@ -30,23 +30,29 @@ THE SOFTWARE. __global__ void Kern(hipLaunchParm lp, float *A) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; A[tx] += 1.0f; } int main() { - float *A, *Ad; + float A[len]; + float *Ad; + for(int i=0;i __global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len) { - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx < Len) { Cd[tx] = Ad[tx] + Bd[tx]; diff --git a/hipamd/tests/src/hipC.c b/hipamd/tests/src/hipC.c index 644df6c98f..efa03bb909 100644 --- a/hipamd/tests/src/hipC.c +++ b/hipamd/tests/src/hipC.c @@ -34,7 +34,7 @@ THE SOFTWARE. #define SIZE 1024*1024*sizeof(int) __global__ void Iter(hipLaunchParm lp, int *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i=0;i(my_sdata); #endif - size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t tid = hipThreadIdx_x; + size_t gid = (blockIdx.x * blockDim.x + threadIdx.x); + size_t tid = threadIdx.x; // initialize dynamic shared memory if (tid < groupElements) { diff --git a/hipamd/tests/src/kernel/hipDynamicShared2.cpp b/hipamd/tests/src/kernel/hipDynamicShared2.cpp index 95e70a9956..4567ff6fc2 100644 --- a/hipamd/tests/src/kernel/hipDynamicShared2.cpp +++ b/hipamd/tests/src/kernel/hipDynamicShared2.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. __global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) { HIP_DYNAMIC_SHARED(float, sBd); - int tx = hipThreadIdx_x; + int tx = threadIdx.x; for(int i=0;i= 0); + HIPASSERT(minor >= 0); + } + passed(); +} diff --git a/hipamd/tests/src/runtimeApi/device/hipDeviceGetName.cpp b/hipamd/tests/src/runtimeApi/device/hipDeviceGetName.cpp new file mode 100644 index 0000000000..ba260ac2e5 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/device/hipDeviceGetName.cpp @@ -0,0 +1,47 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * Conformance test for checking functionality of + * hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device); + */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + +#define len 256 + +int main() +{ + int numDevices = 0; + char name[len]; + hipDevice_t device; + HIPCHECK(hipGetDeviceCount(&numDevices)); + for(int i=0;i +#include +#include"test_common.h" + +int main(){ + hipFuncCache_t cacheConfig; + void *func; + hipFuncSetCacheConfig(func, cacheConfig); + passed(); +} + diff --git a/hipamd/tests/src/runtimeApi/device/hipSetDevice.cpp b/hipamd/tests/src/runtimeApi/device/hipSetDevice.cpp new file mode 100644 index 0000000000..b1b7cac12d --- /dev/null +++ b/hipamd/tests/src/runtimeApi/device/hipSetDevice.cpp @@ -0,0 +1,36 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t EXCLUDE_HIP_PLATFORM + * HIT_END + */ + +#include "test_common.h" + +int main(){ + int numDevices = 0; + HIPCHECK(hipGetDeviceCount(&numDevices)); + for(int i=0;i0.0f); printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e)); e = hipEventElapsedTime(&t, stop, start); HIPCHECK_API(e, expectedStopError); - if (expectedStopError == hipSuccess) + if (expectedStopError == hipSuccess) assert (t<0.0f); printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e)); diff --git a/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp b/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp index 9fad60aec8..f9359fb5da 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipHostGetFlags.cpp @@ -33,7 +33,7 @@ THE SOFTWARE. #define SIZE LEN*sizeof(float) __global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; Cd[tx] = Ad[tx] + Bd[tx]; } diff --git a/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp b/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp index 607e2a9f63..4210ef0654 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipHostMalloc.cpp @@ -33,13 +33,13 @@ #define SIZE LEN*sizeof(float) __global__ void Add(float *Ad, float *Bd, float *Cd){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Cd[tx] = Ad[tx] + Bd[tx]; } __global__ void Set(int *Ad, int val){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Ad[tx] = val; } @@ -52,13 +52,13 @@ std::vector syncMsg = {"event", "stream", "device"}; void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg) { - std::cerr << "test: CheckHostPointer " << msg + std::cerr << "test: CheckHostPointer " << msg //<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC //<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE - << " eventFlags = " << std::hex << eventFlags - << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") - << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") - << " ptr=" << ptr + << " eventFlags = " << std::hex << eventFlags + << ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "") + << ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "") + << " ptr=" << ptr << " syncMethod=" << syncMsg[syncMethod] << "\n"; hipStream_t s; @@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe default: assert(0); }; - + for (int i=0; i __global__ void Inc(hipLaunchParm lp, float *Ad){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; Ad[tx] = Ad[tx] + float(1); } template -void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) +void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister) { A = A + offset; numElements -= offset; @@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes)); - // + // HIPCHECK(hipMemcpy(Bd, A, sizeBytes, hipMemcpyHostToDevice)); HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost)); @@ -81,7 +81,7 @@ int main(int argc, char *argv[]) const size_t size = N * sizeof(float); - if (p_tests & 0x1) { + if (p_tests & 0x1) { float *A, **Ad; int num_devices; HIPCHECK(hipGetDeviceCount(&num_devices)); @@ -115,7 +115,7 @@ int main(int argc, char *argv[]) } - if (p_tests & 0x6) { + if (p_tests & 0x6) { // Sensitize HIP bug if device does not match where the memory was registered. HIPCHECK(hipSetDevice(0)); @@ -129,7 +129,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipMalloc(&Bd, size)); // TODO - set to 128 -#define OFFSETS_TO_TRY 128 +#define OFFSETS_TO_TRY 128 assert (N>OFFSETS_TO_TRY); if (p_tests & 0x2) { diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp index e8e803e44c..d8438fa848 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpy.cpp @@ -58,7 +58,7 @@ public: void offset(int offset) { _offset = offset; }; int offset() const { return _offset; }; - + private: T * _A_d; T* _B_d; @@ -72,7 +72,7 @@ private: template DeviceMemory::DeviceMemory(size_t numElements) - : _maxNumElements(numElements), + : _maxNumElements(numElements), _offset(0) { T ** np = nullptr; @@ -93,7 +93,7 @@ DeviceMemory::~DeviceMemory () HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0); HIPCHECK (hipFree(_C_dd)); - + _C_dd = NULL; }; @@ -125,7 +125,7 @@ public: T * A_hh; T* B_hh; - bool _usePinnedHost; + bool _usePinnedHost; private: size_t _maxNumElements; @@ -165,11 +165,11 @@ HostMemory::HostMemory(size_t numElements, bool usePinnedHost) template void -HostMemory::reset(size_t numElements, bool full) +HostMemory::reset(size_t numElements, bool full) { // Initialize the host data: for (size_t i=0; i void memcpytest2(DeviceMemory *dmem, HostMemory *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault) { size_t sizeElements = numElements * sizeof(T); - printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", - __func__, + printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n", + __func__, TYPENAME(T), sizeElements, sizeElements/1024.0/1024.0, hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault, @@ -273,8 +273,8 @@ void memcpytest2_for_type(size_t numElements) { printSep(); - DeviceMemory memD(numElements); - HostMemory memU(numElements, 0/*usePinnedHost*/); + DeviceMemory memD(numElements); + HostMemory memU(numElements, 0/*usePinnedHost*/); HostMemory memP(numElements, 1/*usePinnedHost*/); for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) { @@ -307,11 +307,11 @@ void memcpytest2_sizes(size_t maxElem=0) maxElem = free/sizeof(T)/8; } - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); for (size_t elem=1; elem<=maxElem; elem*=2) { @@ -336,11 +336,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets) HIPCHECK(hipMemGetInfo(&free, &total)); - printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", + printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n", deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0); HIPCHECK ( hipDeviceReset() ); - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); HostMemory memP(maxElem, 1/*usePinnedHost*/); size_t elem = maxElem / 2; @@ -380,16 +380,16 @@ void multiThread_1(bool serialize, bool usePinnedHost) { printSep(); printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost); - DeviceMemory memD(N); - HostMemory mem1(N, usePinnedHost); - HostMemory mem2(N, usePinnedHost); + DeviceMemory memD(N); + HostMemory mem1(N, usePinnedHost); + HostMemory mem2(N, usePinnedHost); std::thread t1 (memcpytest2, &memD, &mem1, N, 0,0,0); if (serialize) { t1.join(); } - + std::thread t2 (memcpytest2,&memD, &mem2, N, 0,0,0); if (serialize) { t2.join(); @@ -427,21 +427,21 @@ int main(int argc, char *argv[]) // Some tests around the 64KB boundary which have historically shown issues: printf ("\n\n=== tests&0x2 (64KB boundary)\n"); size_t maxElem = 32*1024*1024; - DeviceMemory memD(maxElem); - HostMemory memU(maxElem, 0/*usePinnedHost*/); - HostMemory memP(maxElem, 0/*usePinnedHost*/); + DeviceMemory memD(maxElem); + HostMemory memU(maxElem, 0/*usePinnedHost*/); + HostMemory memP(maxElem, 0/*usePinnedHost*/); // These all pass: - memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 15*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0); // Just over 64MB: - memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); - memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 17*1024*1024+1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memU, 32*1024*1024, 0, 0, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); + memcpytest2(&memD, &memP, 32*1024*1024, 1, 1, 0); } @@ -464,7 +464,7 @@ int main(int argc, char *argv[]) // Simplest cases: serialize the threads, and also used pinned memory: // This verifies that the sub-calls to memcpytest2 are correct. - multiThread_1(true, true); + multiThread_1(true, true); // Serialize, but use unpinned memory to stress the unpinned memory xfer path. multiThread_1(true, false); diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp index 22bd30689a..5cd46c808a 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyAsync.cpp @@ -59,7 +59,7 @@ struct HostTraits static const char *Name() { return "Pinned"; } ; static void *Alloc(size_t sizeBytes) { - void *p; + void *p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault)); return p; }; @@ -67,11 +67,11 @@ struct HostTraits template -__global__ void +__global__ void addK (hipLaunchParm lp, T *A, T K, size_t numElements) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i -void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) +void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide) { HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible. size_t Nbytes = numElements*sizeof(T); @@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); printf ("-----------------------------------------------------------------------------------------------\n"); - printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, HostTraits::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes); T *A_h = NULL; @@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, size_t eachCopyBytes = eachCopyElements * sizeof(T); printf ("-----------------------------------------------------------------------------------------------\n"); - printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", + printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n", __func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes); T *A_d; @@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies, //stream=0; // fixme TODO - for (int i=0; i 1) + { + HIPCHECK(hipSetDevice(0)); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMalloc(&X_d,Nbytes)); + HIPCHECK(hipMalloc(&Y_d,Nbytes)); + HIPCHECK(hipMalloc(&Z_d,Nbytes)); + + + HIPCHECK(hipSetDevice(0)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes)); + HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes)); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes)); + HIPCHECK(hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK(hipFree(X_d)); + HIPCHECK(hipFree(Y_d)); + HIPCHECK(hipFree(Z_d)); + } + + passed(); + +} + diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp new file mode 100644 index 0000000000..5c99b43564 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyDtoDAsync.cpp @@ -0,0 +1,84 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * Conformance test for checking functionality of + * hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes); + */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + +int main() +{ + hipDevice_t device; + size_t Nbytes = N*sizeof(int); + int numDevices = 0; + int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; + int *A_h, *B_h, *C_h ; + hipStream_t s; + + HIPCHECK(hipGetDeviceCount(&numDevices)); + if(numDevices > 1) + { + HIPCHECK(hipSetDevice(0)); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMalloc(&X_d,Nbytes)); + HIPCHECK(hipMalloc(&Y_d,Nbytes)); + HIPCHECK(hipMalloc(&Z_d,Nbytes)); + + + HIPCHECK(hipSetDevice(0)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIPCHECK(hipStreamCreate(&s)); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s)); + HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s)); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s)); + HIPCHECK(hipStreamSynchronize(s)); + HIPCHECK(hipDeviceSynchronize()); + + HipTest::checkVectorADD(A_h, B_h, C_h, N); + HIPCHECK(hipStreamDestroy(s)); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK(hipFree(X_d)); + HIPCHECK(hipFree(Y_d)); + HIPCHECK(hipFree(Z_d)); + } + + passed(); + + +} + diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp new file mode 100644 index 0000000000..7e2fc2d3d0 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeer.cpp @@ -0,0 +1,80 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * Conformance test for checking functionality of + * hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes); + */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + +int main() +{ + hipDevice_t device; + size_t Nbytes = N*sizeof(int); + int numDevices = 0; + int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; + int *A_h, *B_h, *C_h ; + + HIPCHECK(hipGetDeviceCount(&numDevices)); + if(numDevices > 1) + { + HIPCHECK(hipSetDevice(0)); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMalloc(&X_d,Nbytes)); + HIPCHECK(hipMalloc(&Y_d,Nbytes)); + HIPCHECK(hipMalloc(&Z_d,Nbytes)); + + HIPCHECK(hipSetDevice(0)); + HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIPCHECK(hipSetDevice(1)); + hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs. + hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK(hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK(hipFree(X_d)); + HIPCHECK(hipFree(Y_d)); + HIPCHECK(hipFree(Z_d)); + } + passed(); + + + + +} + diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp new file mode 100644 index 0000000000..b01a0aeb1d --- /dev/null +++ b/hipamd/tests/src/runtimeApi/memory/hipMemcpyPeerAsync.cpp @@ -0,0 +1,85 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR +IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* + * Conformance test for checking functionality of + * hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes); + */ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + +int main() +{ + hipDevice_t device; + size_t Nbytes = N*sizeof(int); + int numDevices = 0; + int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d; + int *A_h, *B_h, *C_h ; + hipStream_t s; + + + HIPCHECK(hipGetDeviceCount(&numDevices)); + if(numDevices > 1) + { + HIPCHECK(hipSetDevice(0)); + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N); + HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMalloc(&X_d,Nbytes)); + HIPCHECK(hipMalloc(&Y_d,Nbytes)); + HIPCHECK(hipMalloc(&Z_d,Nbytes)); + + + HIPCHECK(hipSetDevice(0)); + HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice)); + HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice)); + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N); + HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK (hipDeviceSynchronize()); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIPCHECK(hipStreamCreate(&s)); + HIPCHECK(hipSetDevice(1)); + HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s)); + HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s)); + + hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N); + HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost)); + HIPCHECK (hipDeviceSynchronize()); + HIPCHECK (hipStreamSynchronize(s)); + HipTest::checkVectorADD(A_h, B_h, C_h, N); + + HIPCHECK(hipStreamDestroy(s)); + HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false); + HIPCHECK(hipFree(X_d)); + HIPCHECK(hipFree(Y_d)); + HIPCHECK(hipFree(Z_d)); + } + + passed(); + + +} + diff --git a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp index 6042f538b3..667d4b404b 100644 --- a/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp +++ b/hipamd/tests/src/runtimeApi/memory/hipMemoryAllocateCoherent.cpp @@ -31,7 +31,7 @@ THE SOFTWARE. __global__ void Kernel(hipLaunchParm lp,volatile float* hostRes) { - int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tid = threadIdx.x + blockIdx.x * blockDim.x; hostRes[tid] = tid + 1; __threadfence_system(); // expecting that the data is getting flushed to host here! diff --git a/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp b/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp index 0c2c387c2f..81450f1fba 100644 --- a/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp +++ b/hipamd/tests/src/runtimeApi/memory/p2p_copy_coherency.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. /* HIT_START * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 - * RUN: %t EXCLUDE_HIP_PLATFORM all + * RUN: %t EXCLUDE_HIP_PLATFORM all * HIT_END */ @@ -33,11 +33,13 @@ THE SOFTWARE. #ifdef __HIP_PLATFORM_HCC__ #include -#define USE_HCC_MEMTRACKER 0 #endif +#define USE_HCC_MEMTRACKER 0 /* Debug flag to show the memtracker periodically */ -int elementSizes[] = {16, 1024,524288}; + + +int elementSizes[] = {1, 16, 1024, 524288, 16*1000*1000}; int nSizes = sizeof(elementSizes) / sizeof(int); int enablePeers(int dev0, int dev1) @@ -57,26 +59,30 @@ int enablePeers(int dev0, int dev1) return 0; }; +// Set value of array to specified 32-bit integer: __global__ void -memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElements) +memsetIntKernel(int * ptr, const int val, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ ptr[i] = val; } }; __global__ void -memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numElements) +memcpyIntKernel(const int * src, int* dst, size_t numElements) { - int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - int stride = hipBlockDim_x * hipGridDim_x ; + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; for (size_t i= gid; i< numElements; i+=stride){ dst[i] = src[i]; } }; + +// CHeck arrays in reverse order, to more easily detect cases where +// the copy is "partially" done. void checkReverse(const int *ptr, int numElements, int expected) { for (int i=numElements-1; i>=0; i--) { if (ptr[i] != expected) { @@ -88,7 +94,8 @@ void checkReverse(const int *ptr, int numElements, int expected) { printf ("test: OK\n"); } -void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, + +void runTestImpl(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements, int * dataGpu0_0, int * dataGpu0_1, int *dataGpu1, int *dataHost, int expected) { hipEvent_t e; @@ -96,7 +103,7 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ HIPCHECK(hipEventCreateWithFlags(&e,0)); } const size_t sizeElements = numElements * sizeof(int); - printf ("test: runTest with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled"); + printf ("test: runTestImpl with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled"); hipStream_t stepAStream = gpu0Stream; @@ -127,9 +134,12 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_ HIPCHECK(hipStreamSynchronize(gpu0Stream)); checkReverse(dataHost, numElements, expected); + if(!hostSync) { + HIPCHECK(hipEventDestroy(e)); + } } -void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMemcpy) +void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync) { const size_t sizeElements = numElements * sizeof(int); @@ -163,12 +173,15 @@ void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMe #endif printf (" test: init complete\n"); - runTest(useMemcpy , hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); + runTestImpl(true, hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected); HIPCHECK(hipFree(dataGpu0_0)); HIPCHECK(hipFree(dataGpu0_1)); HIPCHECK(hipFree(dataGpu1)); HIPCHECK(hipHostFree(dataHost)); + + HIPCHECK(hipStreamDestroy(gpu0Stream)); + HIPCHECK(hipStreamDestroy(gpu1Stream)); }; int main(int argc, char *argv[]) @@ -192,11 +205,9 @@ int main(int argc, char *argv[]) return -1; }; - for(int index = 1;index < nSizes;index++) { - testMultiGpu(dev0, dev1, elementSizes[index] , false /* GPU Synchronization*/, true); - testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, true); - testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, false); - testMultiGpu(dev0, dev1, elementSizes[index] , false /*Host Synchronization*/, false); + for(int index = 0;index < nSizes;index++) { + testMultiGpu(dev0, dev1, elementSizes[index] , false /*GPU Synchronization*/); + testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/); } diff --git a/hipamd/tests/src/runtimeApi/module/hipModule.cpp b/hipamd/tests/src/runtimeApi/module/hipModule.cpp index 1b7b62cff2..f2c2137738 100644 --- a/hipamd/tests/src/runtimeApi/module/hipModule.cpp +++ b/hipamd/tests/src/runtimeApi/module/hipModule.cpp @@ -34,7 +34,7 @@ THE SOFTWARE. #define kernel_name "hello_world" __global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){ - int tx = hipThreadIdx_x; + int tx = threadIdx.x; Bd[tx] = Ad[tx]; } diff --git a/hipamd/tests/src/runtimeApi/module/vcpy_kernel.cpp b/hipamd/tests/src/runtimeApi/module/vcpy_kernel.cpp index 0375eee342..7ee1ad333b 100644 --- a/hipamd/tests/src/runtimeApi/module/vcpy_kernel.cpp +++ b/hipamd/tests/src/runtimeApi/module/vcpy_kernel.cpp @@ -24,7 +24,7 @@ THE SOFTWARE. extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b) { - int tx = hipThreadIdx_x; + int tx = threadIdx.x; b[tx] = a[tx]; } diff --git a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp index 4f73b67ad7..9b2e749cf9 100644 --- a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp +++ b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams1.cpp @@ -41,8 +41,8 @@ void printSep() // Designed to stress a small number of simple smoke tests template< - typename T=float, - class P=HipTest::Unpinned, + typename T=float, + class P=HipTest::Unpinned, class C=HipTest::Memcpy > void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream) diff --git a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp index 43a3e9bdea..3727901645 100644 --- a/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp +++ b/hipamd/tests/src/runtimeApi/multiThread/hipMultiThreadStreams2.cpp @@ -35,7 +35,7 @@ THE SOFTWARE. template __global__ void Inc(hipLaunchParm lp, T *Array){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; Array[tx] = Array[tx] + T(1); } @@ -116,7 +116,7 @@ int main(int argc, char **argv) } const size_t size = N * sizeof(float); - + for (int i=0; i< iterations; i++) { std::thread t1(run1, size, stream[0]); @@ -126,7 +126,7 @@ int main(int argc, char **argv) // std::cout<<"T1"<::reset() { HipTest::setDefaultData(_numElements, _A_h, _B_h, _C_h); H2D(); - + } @@ -238,7 +238,7 @@ int main(int argc, char *argv[]) nullStreamer->D2H(); HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -263,7 +263,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); + HipTest::checkTest(expected_H, nullStreamer->_C_h, numElements); } } @@ -289,10 +289,10 @@ int main(int argc, char *argv[]) // Copy with stream1, this could go async if the streamSync doesn't synchronize ALL the streams. HIPCHECK(hipMemcpyAsync(streamers[0]->_C_h, streamers[0]->_C_d, streamers[0]->_numElements*sizeof(int), hipMemcpyDeviceToHost, streamers[1]->_stream)); - + HIPCHECK(hipDeviceSynchronize()); - HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); + HipTest::checkTest(expected_H, streamers[0]->_C_h, numElements); } diff --git a/hipamd/tests/src/runtimeApi/stream/hipStream.h b/hipamd/tests/src/runtimeApi/stream/hipStream.h index 6468667703..0ce06bbc3f 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStream.h +++ b/hipamd/tests/src/runtimeApi/stream/hipStream.h @@ -73,7 +73,7 @@ void D2H(T *Dst, T *Src, size_t size){ template __global__ void Inc(hipLaunchParm lp, T *In){ -int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; +int tx = threadIdx.x + blockIdx.x * blockDim.x; In[tx] = In[tx] + 1; } @@ -94,12 +94,12 @@ void initArrays(T **Ad, T **Ah, } template -void initArrays(T **Ad, size_t N, - bool deviceMemory = false, +void initArrays(T **Ad, size_t N, + bool deviceMemory = false, bool usePinnedHost = false){ size_t NBytes = N * sizeof(T); if(deviceMemory){ - HIPCHECK( hipMalloc(Ad, NBytes)); + HIPCHECK( hipMalloc(Ad, NBytes)); }else{ if(usePinnedHost){ HIPCHECK(hipHostMalloc((void**)Ad, NBytes, hipHostMallocDefault)); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamGetFlags.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamGetFlags.cpp new file mode 100644 index 0000000000..9212c70e7f --- /dev/null +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamGetFlags.cpp @@ -0,0 +1,44 @@ +/* +Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved. +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* HIT_START + * BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc + * RUN: %t + * HIT_END + */ + +#include "test_common.h" + + +int main(int argc, char *argv[]) +{ + hipStream_t stream; + unsigned int flags; + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamDefault)); + HIPCHECK(hipStreamGetFlags(stream, &flags)); + HIPASSERT(flags == 0); + HIPCHECK(hipStreamDestroy(stream)); + + HIPCHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + HIPCHECK(hipStreamGetFlags(stream, &flags)); + HIPASSERT(flags == 1); + HIPCHECK(hipStreamDestroy(stream)); + + passed(); +} diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp index c6a58ce7d4..4c49d80c05 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamSync2.cpp @@ -59,23 +59,23 @@ const char *syncModeString(int syncMode) { void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode syncMode, bool expectMismatch) { - // This test sends a long-running kernel to the null stream, then tests to see if the + // This test sends a long-running kernel to the null stream, then tests to see if the // specified synchronization technique is effective. // - // Some syncMode are not expected to correctly sync (for example "syncNone"). in these + // Some syncMode are not expected to correctly sync (for example "syncNone"). in these // cases the test sets expectMismatch and the check logic below will attempt to ensure that // the undesired synchronization did not occur - ie ensure the kernel is still running and did // not yet update the stop event. This can be tricky since if the kernel runs fast enough it - // may complete before the check. To prevent this, the addCountReverse has a count parameter - // which causes it to loop repeatedly, and the results are checked in reverse order. + // may complete before the check. To prevent this, the addCountReverse has a count parameter + // which causes it to loop repeatedly, and the results are checked in reverse order. // // Tests with expectMismatch=true should ensure the kernel finishes correctly. This results // are checked and we test to make sure stop event has completed. - + if (!(testMask & p_tests)) { return; } - printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", + printf ("\ntest 0x%02x: syncMode=%s expectMismatch=%d\n", testMask, syncModeString(syncMode), expectMismatch); size_t sizeBytes = numElements * sizeof(int); @@ -98,7 +98,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); // Launch kernel into null stream, should result in C_h == count. hipLaunchKernelGGL(HipTest::addCountReverse , dim3(blocks), dim3(threadsPerBlock), 0, 0 /*stream*/, C_d, C_h, numElements, count); - HIPCHECK(hipEventRecord(stop, 0/*default*/)); + HIPCHECK(hipEventRecord(stop, 0/*default*/)); switch (syncMode) { case syncNone: @@ -108,18 +108,18 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, SyncMode s break; case syncOtherStream: // Does this synchronize with the null stream? - HIPCHECK(hipStreamSynchronize(otherStream)); + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncMarkerThenOtherStream: case syncMarkerThenOtherNonBlockingStream: - - // this may wait for NULL stream depending hipStreamNonBlocking flag above - HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); - HIPCHECK(hipStreamSynchronize(otherStream)); + // this may wait for NULL stream depending hipStreamNonBlocking flag above + HIPCHECK(hipEventRecord(otherStreamEvent, otherStream)); + + HIPCHECK(hipStreamSynchronize(otherStream)); break; case syncDevice: - HIPCHECK(hipDeviceSynchronize()); + HIPCHECK(hipDeviceSynchronize()); break; default: assert(0); @@ -197,7 +197,7 @@ void runTests(int64_t numElements) int main(int argc, char *argv[]) { // Can' destroy the default stream:// TODO - move to another test - HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); + HIPCHECK_API(hipStreamDestroy(0), hipErrorInvalidResourceHandle); HipTest::parseStandardArguments(argc, argv, true /*failOnUndefinedArg*/); diff --git a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp index f5b1b79550..cf463be76a 100644 --- a/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp +++ b/hipamd/tests/src/runtimeApi/stream/hipStreamWaitEvent.cpp @@ -88,7 +88,7 @@ private: template Streamer::Streamer(int deviceId, T * A_d, size_t numElements, int commandType) : - _preA_d(NULL), + _preA_d(NULL), _A_d(A_d), _deviceId(deviceId), _numElements(numElements), @@ -239,7 +239,7 @@ size_t Streamer::check(int streamerNum, T initValue, T expectedOffset, bool e return _mismatchCount; } - + //--- //Parse arguments specific to this test. @@ -300,7 +300,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< for (int i=0; iexpectedAdd(); - + mismatchCount += streamers[i]->check(i+1, initValue, expected, expectPass); } @@ -330,7 +330,7 @@ void checkAll(int initValue, std::vector &streamers, std::vector< void sync_none(void) {}; -void sync_allDevices(int numDevices) +void sync_allDevices(int numDevices) { for (int d=0; d streamers) +void sync_queryAllUntilComplete(std::vector streamers) { for (int i=streamers.size()-1; i>=0; i--) { streamers[i]->queryUntilComplete(); @@ -347,7 +347,7 @@ void sync_queryAllUntilComplete(std::vector streamers) } -void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) +void sync_streamWaitEvent(hipEvent_t lastEvent, int sideDeviceId, hipStream_t sideStream, bool waitHere) { HIPCHECK(hipSetDevice(sideDeviceId)); @@ -389,7 +389,7 @@ int main(int argc, char *argv[]) initArray_h[i] = initValue; } HIPCHECK(hipMemcpy(initArray_d, initArray_h, sizeElements, hipMemcpyHostToDevice)); - + int numDevices; HIPCHECK(hipGetDeviceCount(&numDevices)); @@ -414,7 +414,7 @@ int main(int argc, char *argv[]) // A sideband stream channel that is independent from above. - // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is + // Used to check to ensure the WaitEvent or other synchronization is working correctly since by default sideStream is // asynchronous wrt the other streams. std::vector sideStreams; for (int d=0; d Test 0x1000 simple null stream tests\n"); + printf ("==> Test 0x1000 simple null stream tests\n"); // try some null stream: hipStreamQuery(0); @@ -463,7 +463,7 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, s1)) HIPCHECK(hipStreamWaitEvent(hipStream_t(0), e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } @@ -476,11 +476,11 @@ int main(int argc, char *argv[]) HIPCHECK(hipEventRecord(e1, hipStream_t(0))) HIPCHECK(hipStreamWaitEvent(s1, e1, 0/*flags*/)); - + HIPCHECK(hipStreamDestroy(s1)); HIPCHECK(hipEventDestroy(e1)); } - + } diff --git a/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp new file mode 100644 index 0000000000..b2a66f61e2 --- /dev/null +++ b/hipamd/tests/src/runtimeApi/synchronization/copy_coherency.cpp @@ -0,0 +1,391 @@ +/* +Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +//ROCM_TARGET=gfx900 hipcc --genco memcpyInt.device.cpp -o memcpyInt.hsaco +//hipcc copy_coherency.cpp -I ~/X/HIP/tests/src/ ~/X/HIP/tests/src/test_common.cpp + + +// TODO - add code object support here. +/* HIT_START + * BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11 + * RUN: %t + * HIT_END + */ + + +// Test cache management (fences) and synchronization between kernel and copy commands. +// Exhaustively tests 3 command types (copy, kernel, module kernel), +// many sync types (see SyncType), followed by another command, across a sweep +// of data sizes designed to stress various levels of the memory hierarchy. + +#include "hip/hip_runtime.h" +#include "test_common.h" + +// TODO - turn this back on when test infra can copy the module files to use as test inputs. +#define SKIP_MODULE_KERNEL 1 + + +class MemcpyFunction +{ +public: + MemcpyFunction(const char *fileName, const char *functionName) { load(fileName, functionName); }; + void load(const char *fileName, const char *functionName); + void launch(int * dst, const int * src, size_t numElements, hipStream_t s); + +private: + hipFunction_t _function; + hipModule_t _module; +}; + + +void MemcpyFunction::load(const char *fileName, const char *functionName) +{ +#if SKIP_MODULE_KERNEL!=1 + HIPCHECK(hipModuleLoad(&_module, fileName)); + HIPCHECK(hipModuleGetFunction(&_function, _module, functionName)); +#endif +}; + + +void MemcpyFunction::launch(int * dst, const int * src, size_t numElements, hipStream_t s) +{ + struct { + uint32_t _hidden[6]; + int* _dst; + const int* _src; + size_t _numElements; + } args; + + args._dst = dst; + args._src = src; + args._numElements = numElements; + + size_t size = sizeof(args); + void *config[] = { + HIP_LAUNCH_PARAM_BUFFER_POINTER, &args, + HIP_LAUNCH_PARAM_BUFFER_SIZE, &size, + HIP_LAUNCH_PARAM_END + }; + + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + HIPCHECK(hipModuleLaunchKernel(_function, blocks, 1, 1, threadsPerBlock, 1, 1, 0/*dynamicShared*/, s, NULL, (void**)&config)); +}; + +bool g_warnOnFail = true; +//int g_elementSizes[] = {1, 16, 1024, 524288, 16*1000*1000}; // TODO +int g_elementSizes[] = {128*1000, 256*1000, 16*1000*1000}; + +MemcpyFunction g_moduleMemcpy("memcpyInt.hsaco", "memcpyIntKernel"); + + + +// Set value of array to specified 32-bit integer: +__global__ void +memsetIntKernel(int * ptr, const int val, size_t numElements) +{ + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; + for (size_t i= gid; i< numElements; i+=stride){ + ptr[i] = val; + } +}; + +__global__ void +memcpyIntKernel(int *dst, const int * src, size_t numElements) +{ + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; + + +// CHeck arrays in reverse order, to more easily detect cases where +// the copy is "partially" done. +void checkReverse(const int *ptr, int numElements, int expected) { + int mismatchCnt = 0; + for (int i=numElements-1; i>=0; i--) { + if (ptr[i] != expected) { + fprintf (stderr, "%s**error: i=%d, ptr[i] == (%x) , does not equal expected (%x)\n%s", KRED, i, ptr[i], expected, KNRM); + if (!g_warnOnFail) { + assert (ptr[i] == expected); + } + if (++mismatchCnt >= 10) { + break; + } + } + } + + fprintf (stderr, "test: OK\n"); +} + +#define ENUM_CASE_STR(x) case x: return #x + +enum CmdType { + COPY, + KERNEL, + MODULE_KERNEL, + MAX_CmdType +}; + + +const char * CmdTypeStr(CmdType c) +{ + switch(c) { + ENUM_CASE_STR(COPY); + ENUM_CASE_STR(KERNEL); + ENUM_CASE_STR(MODULE_KERNEL); + default: return "UNKNOWN"; + }; +} + + +enum SyncType { + NONE, + EVENT_QUERY, + EVENT_SYNC, + STREAM_WAIT_EVENT, + STREAM_QUERY, + STREAM_SYNC, + DEVICE_SYNC, + MAX_SyncType +}; + + +const char * SyncTypeStr(SyncType s) +{ + switch(s) { + ENUM_CASE_STR(NONE); + ENUM_CASE_STR(EVENT_QUERY); + ENUM_CASE_STR(EVENT_SYNC); + ENUM_CASE_STR(STREAM_WAIT_EVENT); + ENUM_CASE_STR(STREAM_QUERY); + ENUM_CASE_STR(STREAM_SYNC); + ENUM_CASE_STR(DEVICE_SYNC); + default: return "UNKNOWN"; + }; +}; + + +void runCmd(CmdType cmd, int *dst, const int *src, hipStream_t s, size_t numElements) +{ + switch (cmd) { + case COPY: + HIPCHECK(hipMemcpyAsync(dst, src, numElements*sizeof(int), hipMemcpyDeviceToDevice, s)); + break; + case KERNEL: + { + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memcpyIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, s, + dst, src, numElements); + } + break; + case MODULE_KERNEL: + g_moduleMemcpy.launch(dst, src, numElements, s); + break; + default: + failed("unknown cmd=%d type", cmd); + }; +} + +void resetInputs( int * Ad, int * Bd, int *Cd, int *Ch, size_t numElements, int expected) +{ + unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Ad, expected, numElements); + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Bd, 0xDEADBEEF, numElements); // poison with bad value to ensure is overwritten correctly + hipLaunchKernelGGL(memsetIntKernel, dim3(blocks), dim3(threadsPerBlock), 0, hipStream_t(0), + Bd, 0xF000BA55, numElements); // poison with bad value to ensure is overwritten correctly + memset(Ch, 13, numElements*sizeof(int)); // poison with bad value to ensure is overwritten correctly + HIPCHECK(hipDeviceSynchronize()); +} + +// Intended to test proper synchronization and cache flushing between CMDA and CMDB. +// CMD are of type CmdType. All command copy memory, using either hipMemcpyAsync or kernel implementations. +// CmdA copies from Ad to Bd, +// Some form of synchronization is applied. +// Then cmdB copies from Bd to Cd. +// +// Cd is then copied to host Ch using a memory copy. +// +// Correct result at the end is that Ch contains the contents originally in Ad (integer 0x42) +void runTestImpl(CmdType cmdAType, SyncType syncType, CmdType cmdBType, + hipStream_t stream1, hipStream_t stream2, int numElements, + int * Ad, int * Bd, int *Cd, int *Ch, + int expected) +{ + hipEvent_t e; + HIPCHECK(hipEventCreateWithFlags(&e,0)); + + resetInputs(Ad, Bd, Cd, Ch, numElements, expected); + + const size_t sizeElements = numElements * sizeof(int); + fprintf (stderr, "test: runTest with %zu bytes (%6.2f MB) cmdA=%s; sync=%s; cmdB=%s\n", + sizeElements, (double) (sizeElements/1024.0), CmdTypeStr(cmdAType), SyncTypeStr(syncType), CmdTypeStr(cmdBType)); + + if (SKIP_MODULE_KERNEL && ((cmdAType == MODULE_KERNEL) || (cmdBType == MODULE_KERNEL))) { + fprintf (stderr, "warn: skipping since test infra does not yet support modules\n"); + return; + } + + + // Step A: + runCmd(cmdAType, Bd, Ad, stream1, numElements); + + + // Sync in-between? + switch (syncType) { + case NONE: + break; + case EVENT_QUERY: + { + hipError_t st = hipErrorNotReady; + HIPCHECK(hipEventRecord(e, stream1)); + do { + st = hipEventQuery(e); + } while (st == hipErrorNotReady); + HIPCHECK(st); + } + break; + case EVENT_SYNC: + HIPCHECK(hipEventRecord(e, stream1)); + HIPCHECK(hipEventSynchronize(e)); + break; + case STREAM_WAIT_EVENT: + HIPCHECK(hipEventRecord(e, stream1)); + HIPCHECK(hipStreamWaitEvent(stream2, e, 0)); + break; + case STREAM_QUERY: + { + hipError_t st = hipErrorNotReady; + do { + st = hipStreamQuery(stream1); + } while (st == hipErrorNotReady); + HIPCHECK(st); + } + break; + case STREAM_SYNC: + HIPCHECK(hipStreamSynchronize(stream1)); + break; + case DEVICE_SYNC: + HIPCHECK(hipDeviceSynchronize()); + break; + default: + fprintf(stderr, "warning: unknown sync type=%s", SyncTypeStr(syncType)); + return; // FIXME, this doesn't clean up + //failed("unknown sync type=%s", SyncTypeStr(syncType)); + }; + + + runCmd(cmdBType, Cd, Bd, stream2, numElements); + + + // Copy back to host, use async copy to avoid any extra synchronization that might mask issues. + HIPCHECK(hipMemcpyAsync(Ch, Cd, sizeElements, hipMemcpyDeviceToHost, stream2)); + HIPCHECK(hipStreamSynchronize(stream2)); + + checkReverse(Ch, numElements, expected); + + HIPCHECK(hipEventDestroy(e)); +}; + + +void testWrapper(size_t numElements) +{ + const size_t sizeElements = numElements * sizeof(int); + const int expected = 0x42; + int * Ad, * Bd, *Cd, *Ch; + + HIPCHECK(hipMalloc(&Ad, sizeElements)); + HIPCHECK(hipMalloc(&Bd, sizeElements)); + HIPCHECK(hipMalloc(&Cd, sizeElements)); + HIPCHECK(hipHostMalloc(&Ch, sizeElements)); // Ch is the end array + + + + hipStream_t stream1, stream2; + + HIPCHECK(hipStreamCreate(&stream1)); + HIPCHECK(hipStreamCreate(&stream2)); + + + HIPCHECK(hipDeviceSynchronize()); + fprintf (stderr, "test: init complete, start running tests\n"); + + + runTestImpl(COPY, EVENT_SYNC, KERNEL, stream1, stream2, numElements, Ad, Bd, Cd, Ch, expected); + + for (int cmdA=0; cmdA + + + +extern "C" __global__ void +memcpyIntKernel(hipLaunchParm lp, int *dst, const int * src, size_t numElements) +{ + int gid = (blockIdx.x * blockDim.x + threadIdx.x); + int stride = blockDim.x * gridDim.x ; + for (size_t i= gid; i< numElements; i+=stride){ + dst[i] = src[i]; + } +}; diff --git a/hipamd/tests/src/stress/hipStressAsync.cpp b/hipamd/tests/src/stress/hipStressAsync.cpp index e06e16809c..a142b41730 100644 --- a/hipamd/tests/src/stress/hipStressAsync.cpp +++ b/hipamd/tests/src/stress/hipStressAsync.cpp @@ -30,7 +30,7 @@ THE SOFTWARE. #define ITER 1<<10 __global__ void Iter(hipLaunchParm lp, int *Ad, int num){ - int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x; + int tx = threadIdx.x + blockIdx.x * blockDim.x; if(tx == 0){ for(int i = 0; i #include "hip/hip_runtime.h" -#include "hip/hip_texture_types.h" #include "hip/hip_runtime_api.h" #define HC __attribute__((hc)) @@ -137,8 +136,8 @@ vectorADD(hipLaunchParm lp, T *C_d, size_t NELEM) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; for (size_t i=offset; i=0; i-=stride) { C_d[i] = A_d[i] + B_d[i]; @@ -170,8 +169,8 @@ addCount( const T *A_d, size_t NELEM, int count) { - size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x); - size_t stride = hipBlockDim_x * hipGridDim_x ; + size_t offset = (blockIdx.x * blockDim.x + threadIdx.x); + size_t stride = blockDim.x * gridDim.x ; // Deliberately do this in an inefficient way to increase kernel runtime for (int i=0; i=0; i-=stride) { C_d[i] = val; @@ -220,12 +219,12 @@ void setDefaultData(size_t numElements, T *A_h, T* B_h, T *C_h) { // Initialize the host data: for (size_t i=0; i void initArrays(T **A_d, T **B_d, T **C_d, - T **A_h, T **B_h, T **C_h, - size_t N, bool usePinnedHost=false) + T **A_h, T **B_h, T **C_h, + size_t N, bool usePinnedHost=false) { size_t Nbytes = N*sizeof(T); @@ -318,7 +317,7 @@ void freeArraysForHost(T *A_h, T *B_h, T *C_h, bool usePinnedHost) template void freeArrays(T *A_d, T *B_d, T *C_d, - T *A_h, T *B_h, T *C_h, bool usePinnedHost) + T *A_h, T *B_h, T *C_h, bool usePinnedHost) { if (A_d) { HIPCHECK ( hipFree(A_d) ); @@ -454,9 +453,9 @@ struct Pinned { static const bool isPinned = true; static const char *str() { return "Pinned"; }; - static void *Alloc(size_t sizeBytes) + static void *Alloc(size_t sizeBytes) { - void *p; + void *p; HIPCHECK(hipHostMalloc((void**)&p, sizeBytes)); return p; }; @@ -464,12 +463,12 @@ struct Pinned { //--- -struct Unpinned +struct Unpinned { static const bool isPinned = false; static const char *str() { return "Unpinned"; }; - static void *Alloc(size_t sizeBytes) + static void *Alloc(size_t sizeBytes) { void *p = malloc (sizeBytes); HIPASSERT(p); @@ -497,7 +496,7 @@ template<> struct MemTraits { - static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { HIPCHECK(hipMemcpy(dest, src, sizeBytes, kind)); } @@ -508,7 +507,7 @@ template<> struct MemTraits { - static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) + static void Copy(void *dest, const void *src, size_t sizeBytes, hipMemcpyKind kind, hipStream_t stream) { HIPCHECK(hipMemcpyAsync(dest, src, sizeBytes, kind, stream)); } diff --git a/hipamd/tests/src/texture/hipTextureObj2D.cpp b/hipamd/tests/src/texture/hipTextureObj2D.cpp index 443d708418..9ddafd6b1c 100644 --- a/hipamd/tests/src/texture/hipTextureObj2D.cpp +++ b/hipamd/tests/src/texture/hipTextureObj2D.cpp @@ -17,8 +17,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; outputData[y*width + x] = tex2D(textureObject, x, y); } diff --git a/hipamd/tests/src/texture/hipTextureRef2D.cpp b/hipamd/tests/src/texture/hipTextureRef2D.cpp index eb27b23230..4430ca722d 100644 --- a/hipamd/tests/src/texture/hipTextureRef2D.cpp +++ b/hipamd/tests/src/texture/hipTextureRef2D.cpp @@ -18,8 +18,8 @@ __global__ void tex2DKernel(float* outputData, int width, int height) { - int x = hipBlockIdx_x*hipBlockDim_x + hipThreadIdx_x; - int y = hipBlockIdx_y*hipBlockDim_y + hipThreadIdx_y; + int x = blockIdx.x*blockDim.x + threadIdx.x; + int y = blockIdx.y*blockDim.y + threadIdx.y; #ifdef __HIP_PLATFORM_HCC__ outputData[y*width + x] = tex2D(tex, textureObject, x, y); #else