このコミットが含まれているのは:
Evgeny Mankov
2017-11-27 17:33:48 +03:00
コミット f55bfb60f3
116個のファイルの変更3310行の追加1477行の削除
+4 -2
ファイルの表示
@@ -127,7 +127,9 @@ if ($HIP_PLATFORM eq "hcc") {
# Force -stdlib=libc++ on UB14.04
$HOST_OSVER= `cat /etc/os-release | grep "^VERSION_ID\=" | cut -d= -f2 | tr -d '\n'`;
if ($HOST_OSNAME eq "ubuntu" and $HOST_OSVER eq "\"14.04\"") {
if (($HOST_OSNAME eq "ubuntu" and $HOST_OSVER eq "\"14.04\"")
or ($HOST_OSNAME eq "\"centos\"" and $HOST_OSVER eq "\"7\"")
or ($HOST_OSNAME eq "\"rhel\"" and $HOST_OSVER eq "\"7.4\"")) {
$HIPCXXFLAGS .= " -stdlib=libc++";
$setStdLib = 1;
}
@@ -136,7 +138,6 @@ if ($HIP_PLATFORM eq "hcc") {
$HIPCXXFLAGS .= " -I$HSA_PATH/include";
$HIPCXXFLAGS .= " -Wno-deprecated-register";
$HIPLDFLAGS .= " -lsupc++";
$HIPLDFLAGS .= " -L$HSA_PATH/lib -L$ROCM_PATH/lib -lhsa-runtime64 -lhc_am -lhsakmt ";
# $HIPLDFLAGS .= " -L$HCC_HOME/compiler/lib -lLLVMAMDGPUDesc -lLLVMAMDGPUUtils -lLLVMMC -lLLVMCore -lLLVMSupport ";
@@ -438,6 +439,7 @@ if($HIP_PLATFORM eq "hcc"){
if ($target_gfx900 eq 1) {
$HIPLDFLAGS .= " --amdgpu-target=gfx900";
$HIPCXXFLAGS .= " -D__HIP_ARCH_GFX900__=1 ";
$ENV{HCC_EXTRA_LIBRARIES_GFX900}="$HIP_PATH/lib/hip_hc_gfx803.ll\n";
}
}
+40
ファイルの表示
@@ -0,0 +1,40 @@
#include "ArgParse.h"
cl::OptionCategory ToolTemplateCategory("CUDA to HIP source translator options");
cl::opt<std::string> OutputFilename("o",
cl::desc("Output filename"),
cl::value_desc("filename"),
cl::cat(ToolTemplateCategory));
cl::opt<bool> Inplace("inplace",
cl::desc("Modify input file inplace, replacing input with hipified output, save backup in .prehip file"),
cl::value_desc("inplace"),
cl::cat(ToolTemplateCategory));
cl::opt<bool> NoBackup("no-backup",
cl::desc("Don't create a backup file for the hipified source"),
cl::value_desc("no-backup"),
cl::cat(ToolTemplateCategory));
cl::opt<bool> NoOutput("no-output",
cl::desc("Don't write any translated output to stdout"),
cl::value_desc("no-output"),
cl::cat(ToolTemplateCategory));
cl::opt<bool> PrintStats("print-stats",
cl::desc("Print translation statistics"),
cl::value_desc("print-stats"),
cl::cat(ToolTemplateCategory));
cl::opt<std::string> OutputStatsFilename("o-stats",
cl::desc("Output filename for statistics"),
cl::value_desc("filename"),
cl::cat(ToolTemplateCategory));
cl::opt<bool> Examine("examine",
cl::desc("Combines -no-output and -print-stats options"),
cl::value_desc("examine"),
cl::cat(ToolTemplateCategory));
cl::extrahelp CommonHelp(ct::CommonOptionsParser::HelpMessage);
+19
ファイルの表示
@@ -0,0 +1,19 @@
#pragma once
#include "clang/Tooling/CommonOptionsParser.h"
#include "llvm/Support/CommandLine.h"
namespace cl = llvm::cl;
namespace ct = clang::tooling;
extern cl::OptionCategory ToolTemplateCategory;
extern cl::opt<std::string> OutputFilename;
extern cl::opt<bool> Inplace;
extern cl::opt<bool> NoBackup;
extern cl::opt<bool> NoOutput;
extern cl::opt<bool> PrintStats;
extern cl::opt<std::string> OutputStatsFilename;
extern cl::opt<bool> Examine;
extern cl::extrahelp CommonHelp;
+10 -4
ファイルの表示
@@ -312,9 +312,16 @@ const std::map<llvm::StringRef, hipCounter> CUDA_TYPE_NAME_MAP{
/// Maps cuda header names to hip header names.
const std::map <llvm::StringRef, hipCounter> CUDA_INCLUDE_MAP{
// CUDA includes
{"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
{"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
{"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}},
{"cuda.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_DRIVER}},
{"cuda_runtime.h", {"hip/hip_runtime.h", CONV_INCLUDE_CUDA_MAIN_H, API_RUNTIME}},
{"cuda_runtime_api.h", {"hip/hip_runtime_api.h", CONV_INCLUDE, API_RUNTIME}},
{"channel_descriptor.h", {"hip/channel_descriptor.h", CONV_INCLUDE, API_RUNTIME}},
{"device_functions.h", {"hip/device_functions.h", CONV_INCLUDE, API_RUNTIME}},
{"driver_types.h", {"hip/driver_types.h", CONV_INCLUDE, API_RUNTIME}},
{"cuComplex.h", {"hip/hip_complex.h", CONV_INCLUDE, API_RUNTIME}},
{"cuda_fp16.h", {"hip/hip_fp16.h", CONV_INCLUDE, API_RUNTIME}},
{"cuda_texture_types.h", {"hip/hip_texture_types.h", CONV_INCLUDE, API_RUNTIME}},
{"vector_types.h", {"hip/hip_vector_types.h", CONV_INCLUDE, API_RUNTIME}},
// CUBLAS includes
{"cublas.h", {"hipblas.h", CONV_INCLUDE, API_BLAS}},
@@ -2677,7 +2684,6 @@ const std::map<llvm::StringRef, hipCounter>& CUDA_RENAMES_MAP() {
// First run, so compute the union map.
ret = CUDA_IDENTIFIER_MAP;
ret.insert(CUDA_INCLUDE_MAP.begin(), CUDA_INCLUDE_MAP.end());
ret.insert(CUDA_TYPE_NAME_MAP.begin(), CUDA_TYPE_NAME_MAP.end());
return ret;
-861
ファイルの表示
@@ -1,861 +0,0 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file Cuda2Hip.cpp
*
* This file is compiled and linked into clang based hipify tool.
*/
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Frontend/FrontendActions.h"
#include "clang/Frontend/TextDiagnosticPrinter.h"
#include "clang/Lex/Lexer.h"
#include "clang/Lex/MacroArgs.h"
#include "clang/Lex/MacroInfo.h"
#include "clang/Lex/PPCallbacks.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Rewrite/Core/Rewriter.h"
#include "clang/Tooling/CommonOptionsParser.h"
#include "clang/Tooling/Refactoring.h"
#include "clang/Tooling/Tooling.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Signals.h"
#include <cstdio>
#include <fstream>
#include <set>
#include <cmath>
#include <chrono>
#include <iomanip>
#include <sstream>
#include "CUDA2HipMap.h"
#include "LLVMCompat.h"
#include "StringUtils.h"
using namespace clang;
using namespace clang::ast_matchers;
using namespace clang::tooling;
using namespace llvm;
#define DEBUG_TYPE "cuda2hip"
// Set up the command line options
static cl::OptionCategory ToolTemplateCategory("CUDA to HIP source translator options");
static cl::opt<std::string> OutputFilename("o",
cl::desc("Output filename"),
cl::value_desc("filename"),
cl::cat(ToolTemplateCategory));
static cl::opt<bool> Inplace("inplace",
cl::desc("Modify input file inplace, replacing input with hipified output, save backup in .prehip file"),
cl::value_desc("inplace"),
cl::cat(ToolTemplateCategory));
static cl::opt<bool> NoBackup("no-backup",
cl::desc("Don't create a backup file for the hipified source"),
cl::value_desc("no-backup"),
cl::cat(ToolTemplateCategory));
static cl::opt<bool> NoOutput("no-output",
cl::desc("Don't write any translated output to stdout"),
cl::value_desc("no-output"),
cl::cat(ToolTemplateCategory));
static cl::opt<bool> PrintStats("print-stats",
cl::desc("Print translation statistics"),
cl::value_desc("print-stats"),
cl::cat(ToolTemplateCategory));
static cl::opt<std::string> OutputStatsFilename("o-stats",
cl::desc("Output filename for statistics"),
cl::value_desc("filename"),
cl::cat(ToolTemplateCategory));
static cl::opt<bool> Examine("examine",
cl::desc("Combines -no-output and -print-stats options"),
cl::value_desc("examine"),
cl::cat(ToolTemplateCategory));
static cl::extrahelp CommonHelp(CommonOptionsParser::HelpMessage);
class Cuda2Hip {
public:
Cuda2Hip(Replacements& R, const std::string &srcFileName) :
Replace(R), mainFileName(srcFileName) {}
enum msgTypes {
HIPIFY_ERROR = 0,
HIPIFY_WARNING
};
std::string getMsgType(msgTypes type) {
switch (type) {
case HIPIFY_ERROR: return "error";
default:
case HIPIFY_WARNING: return "warning";
}
}
protected:
Replacements& Replace;
std::string mainFileName;
virtual void insertReplacement(const Replacement &rep, const FullSourceLoc &fullSL) {
llcompat::insertReplacement(Replace, rep);
if (PrintStats) {
rep.getLength();
Statistics::current().lineTouched(fullSL.getExpansionLineNumber());
Statistics::current().bytesChanged(rep.getLength());
}
}
void insertHipHeaders(Cuda2Hip *owner, const SourceManager &SM) {
if (Replace.size() > 0) {
std::string repName = "#include <hip/hip_runtime.h>";
Statistics::current().incrementCounter({repName, ConvTypes::CONV_INCLUDE_CUDA_MAIN_H, ApiTypes::API_RUNTIME}, "#include <cuda>");
SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID());
FullSourceLoc fullSL(sl, SM);
Replacement Rep(SM, sl, 0, repName + "\n");
insertReplacement(Rep, fullSL);
}
}
void printHipifyMessage(const SourceManager &SM, const SourceLocation &sl, const std::string &message, msgTypes msgType = HIPIFY_WARNING) {
FullSourceLoc fullSL(sl, SM);
llvm::errs() << "[HIPIFY] " << getMsgType(msgType) << ": " << mainFileName << ":" << fullSL.getExpansionLineNumber() << ":" << fullSL.getExpansionColumnNumber() << ": " << message << "\n";
}
void processString(StringRef s, SourceManager &SM, SourceLocation start) {
size_t begin = 0;
while ((begin = s.find("cu", begin)) != StringRef::npos) {
const size_t end = s.find_first_of(" ", begin + 4);
StringRef name = s.slice(begin, end);
const auto found = CUDA_RENAMES_MAP().find(name);
if (found != CUDA_RENAMES_MAP().end()) {
StringRef repName = found->second.hipName;
hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported};
Statistics::current().incrementCounter(counter, name.str());
if (!counter.unsupported) {
SourceLocation sl = start.getLocWithOffset(begin + 1);
Replacement Rep(SM, sl, name.size(), repName);
FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
}
}
if (end == StringRef::npos) {
break;
}
begin = end + 1;
}
}
};
class Cuda2HipCallback;
class HipifyPPCallbacks : public PPCallbacks, public SourceFileCallbacks, public Cuda2Hip {
public:
HipifyPPCallbacks(Replacements& R, const std::string &mainFileName)
: Cuda2Hip(R, mainFileName) {}
virtual bool handleBeginSource(CompilerInstance &CI
#if LLVM_VERSION_MAJOR <= 4
, StringRef Filename
#endif
) override {
Preprocessor &PP = CI.getPreprocessor();
SourceManager &SM = CI.getSourceManager();
setSourceManager(&SM);
PP.addPPCallbacks(std::unique_ptr<HipifyPPCallbacks>(this));
setPreprocessor(&PP);
return true;
}
virtual void handleEndSource() override;
virtual void InclusionDirective(SourceLocation hash_loc,
const Token &include_token,
StringRef file_name, bool is_angled,
CharSourceRange filename_range,
const FileEntry *file, StringRef search_path,
StringRef relative_path,
const clang::Module *imported) override {
if (!_sm->isWrittenInMainFile(hash_loc) || !is_angled) {
return; // We're looking to rewrite angle-includes in the main file to point to hip.
}
const auto found = CUDA_INCLUDE_MAP.find(file_name);
if (found == CUDA_INCLUDE_MAP.end()) {
// Not a CUDA include - don't touch it.
return;
}
Statistics::current().incrementCounter(found->second, file_name.str());
if (found->second.unsupported) {
// An unsupported CUDA header? Oh dear. Print a warning.
printHipifyMessage(*_sm, hash_loc, "Unsupported CUDA header used: " + file_name.str());
return;
}
StringRef repName = found->second.hipName;
DEBUG(dbgs() << "Include file found: " << file_name << "\n"
<< "SourceLocation: "
<< filename_range.getBegin().printToString(*_sm) << "\n"
<< "Will be replaced with " << repName << "\n");
SourceLocation sl = filename_range.getBegin();
SourceLocation sle = filename_range.getEnd();
const char *B = _sm->getCharacterData(sl);
const char *E = _sm->getCharacterData(sle);
SmallString<128> tmpData;
Replacement Rep(*_sm, sl, E - B, Twine("<" + repName + ">").toStringRef(tmpData));
FullSourceLoc fullSL(sl, *_sm);
insertReplacement(Rep, fullSL);
}
/**
* Look at, and consider altering, a given token.
*
* If it's not a CUDA identifier, nothing happens.
* If it's an unsupported CUDA identifier, a warning is emitted.
* Otherwise, the source file is updated with the corresponding hipification.
*/
void RewriteToken(Token t) {
// String literals containing CUDA references need fixing...
if (t.is(tok::string_literal)) {
StringRef s(t.getLiteralData(), t.getLength());
processString(unquoteStr(s), *_sm, t.getLocation());
return;
} else if (!t.isAnyIdentifier()) {
// If it's neither a string nor an identifier, we don't care.
return;
}
StringRef name = t.getIdentifierInfo()->getName();
const auto found = CUDA_RENAMES_MAP().find(name);
if (found == CUDA_RENAMES_MAP().end()) {
// So it's an identifier, but not CUDA? Boring.
return;
}
Statistics::current().incrementCounter(found->second, name.str());
SourceLocation sl = t.getLocation();
if (found->second.unsupported) {
// An unsupported identifier? Curses! Warn the user.
printHipifyMessage(*_sm, sl, "Unsupported CUDA identifier used: " + name.str());
return;
}
StringRef repName = found->second.hipName;
Replacement Rep(*_sm, sl, name.size(), repName);
FullSourceLoc fullSL(sl, *_sm);
insertReplacement(Rep, fullSL);
}
virtual void MacroDefined(const Token &MacroNameTok,
const MacroDirective *MD) override {
if (!_sm->isWrittenInMainFile(MD->getLocation()) ||
MD->getKind() != MacroDirective::MD_Define) {
return;
}
for (auto T : MD->getMacroInfo()->tokens()) {
RewriteToken(T);
}
}
virtual void MacroExpands(const Token &MacroNameTok,
const MacroDefinition &MD, SourceRange Range,
const MacroArgs *Args) override {
if (!_sm->isWrittenInMainFile(MacroNameTok.getLocation())) {
return; // Macros in headers are not our concern.
}
// Is the macro itself a CUDA identifier? If so, rewrite it
RewriteToken(MacroNameTok);
// If it's a macro with arguments, rewrite all the arguments as hip, too.
for (unsigned int i = 0; Args && i < MD.getMacroInfo()->GET_NUM_ARGS(); i++) {
std::vector<Token> toks;
// Code below is a kind of stolen from 'MacroArgs::getPreExpArgument'
// to workaround the 'const' MacroArgs passed into this hook.
const Token *start = Args->getUnexpArgument(i);
size_t len = Args->getArgLength(start) + 1;
llcompat::EnterPreprocessorTokenStream(*_pp, start, len, false);
do {
toks.push_back(Token());
Token &tk = toks.back();
_pp->Lex(tk);
} while (toks.back().isNot(tok::eof));
_pp->RemoveTopOfLexerStack();
// end of stolen code
for (auto tok : toks) {
RewriteToken(tok);
}
}
}
void EndOfMainFile() override {}
void setSourceManager(SourceManager *sm) { _sm = sm; }
void setPreprocessor(Preprocessor *pp) { _pp = pp; }
void setMatch(Cuda2HipCallback *match) { Match = match; }
private:
SourceManager *_sm = nullptr;
Preprocessor *_pp = nullptr;
Cuda2HipCallback *Match = nullptr;
};
class Cuda2HipCallback : public MatchFinder::MatchCallback, public Cuda2Hip {
private:
bool cudaCall(const MatchFinder::MatchResult &Result) {
const CallExpr *call = Result.Nodes.getNodeAs<CallExpr>("cudaCall");
if (!call) {
return false; // Another handler will do it.
}
const FunctionDecl *funcDcl = call->getDirectCallee();
std::string name = funcDcl->getDeclName().getAsString();
SourceManager *SM = Result.SourceManager;
SourceLocation sl = call->getLocStart();
// TODO: Make a lookup table just for functions to improve performance.
const auto found = CUDA_IDENTIFIER_MAP.find(name);
if (found == CUDA_IDENTIFIER_MAP.end()) {
std::string msg = "the following reference is not handled: '" + name + "' [function call].";
printHipifyMessage(*SM, sl, msg);
return true;
}
const hipCounter& hipCtr = found->second;
Statistics::current().incrementCounter(hipCtr, name);
if (hipCtr.unsupported) {
return true; // Silently fail when you find an unsupported member.
// TODO: Print a warning with the diagnostics API?
}
size_t length = name.size();
Replacement Rep(*SM, sl, length, hipCtr.hipName);
FullSourceLoc fullSL(sl, *SM);
insertReplacement(Rep, fullSL);
return true;
}
SourceRange getReadRange(clang::SourceManager &SM, const SourceRange &exprRange) {
SourceLocation begin = exprRange.getBegin();
SourceLocation end = exprRange.getEnd();
bool beginSafe = !SM.isMacroBodyExpansion(begin) || Lexer::isAtStartOfMacroExpansion(begin, SM, LangOptions{});
bool endSafe = !SM.isMacroBodyExpansion(end) || Lexer::isAtEndOfMacroExpansion(end, SM, LangOptions{});
if (beginSafe && endSafe) {
return {SM.getFileLoc(begin), SM.getFileLoc(end)};
} else {
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
}
SourceRange getWriteRange(clang::SourceManager &SM, const SourceRange &exprRange) {
SourceLocation begin = exprRange.getBegin();
SourceLocation end = exprRange.getEnd();
// If the range is contained within a macro, update the macro definition.
// Otherwise, use the file location and hope for the best.
if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) {
return {SM.getFileLoc(begin), SM.getFileLoc(end)};
}
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
StringRef readSourceText(clang::SourceManager& SM, const SourceRange& exprRange) {
return Lexer::getSourceText(CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, LangOptions(), nullptr);
}
/**
* Get a string representation of the expression `arg`, unless it's a defaulting function
* call argument, in which case get a 0. Used for building argument lists to kernel calls.
*/
std::string stringifyZeroDefaultedArg(SourceManager& SM, const Expr* arg) {
if (isa<CXXDefaultArgExpr>(arg)) {
return "0";
} else {
return readSourceText(SM, arg->getSourceRange());
}
}
bool cudaLaunchKernel(const MatchFinder::MatchResult &Result) {
StringRef refName = "cudaLaunchKernel";
if (const CUDAKernelCallExpr *launchKernel = Result.Nodes.getNodeAs<CUDAKernelCallExpr>(refName)) {
SmallString<40> XStr;
raw_svector_ostream OS(XStr);
LangOptions DefaultLangOptions;
SourceManager *SM = Result.SourceManager;
const Expr& calleeExpr = *(launchKernel->getCallee());
OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
const CallExpr& config = *(launchKernel->getConfig());
// Copy the two dimensional arguments verbatim.
OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
// The stream/memory arguments default to zero if omitted.
OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
// If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
int numArgs = launchKernel->getNumArgs();
if (numArgs > 0) {
OS << ", ";
// Start of the first argument.
SourceLocation argStart = launchKernel->getArg(0)->getLocStart();
// End of the last argument.
SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd();
OS << readSourceText(*SM, {argStart, argEnd});
}
OS << ")";
SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
SourceLocation launchStart = replacementRange.getBegin();
SourceLocation launchEnd = replacementRange.getEnd();
size_t length = SM->getCharacterData(Lexer::getLocForEndOfToken(
launchEnd, 0, *SM, DefaultLangOptions)) -
SM->getCharacterData(launchStart);
Replacement Rep(*SM, launchStart, length, OS.str());
FullSourceLoc fullSL(launchStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, refName.str());
return true;
}
return false;
}
bool cudaBuiltin(const MatchFinder::MatchResult &Result) {
if (const MemberExpr *threadIdx = Result.Nodes.getNodeAs<MemberExpr>("cudaBuiltin")) {
if (const OpaqueValueExpr *refBase =
dyn_cast<OpaqueValueExpr>(threadIdx->getBase())) {
if (const DeclRefExpr *declRef =
dyn_cast<DeclRefExpr>(refBase->getSourceExpr())) {
SourceLocation sl = threadIdx->getLocStart();
SourceManager *SM = Result.SourceManager;
StringRef name = declRef->getDecl()->getName();
StringRef memberName = threadIdx->getMemberDecl()->getName();
size_t pos = memberName.find_first_not_of("__fetch_builtin_");
memberName = memberName.slice(pos, memberName.size());
SmallString<128> tmpData;
name = Twine(name + "." + memberName).toStringRef(tmpData);
// TODO: Make a lookup table just for builtins to improve performance.
const auto found = CUDA_IDENTIFIER_MAP.find(name);
if (found != CUDA_IDENTIFIER_MAP.end()) {
Statistics::current().incrementCounter(found->second, name.str());
if (!found->second.unsupported) {
StringRef repName = found->second.hipName;
Replacement Rep(*SM, sl, name.size(), repName);
FullSourceLoc fullSL(sl, *SM);
insertReplacement(Rep, fullSL);
}
} else {
std::string msg = "the following reference is not handled: '" + name.str() + "' [builtin].";
printHipifyMessage(*SM, sl, msg);
}
}
}
return true;
}
return false;
}
bool cudaEnumConstantRef(const MatchFinder::MatchResult &Result) {
if (const DeclRefExpr *enumConstantRef = Result.Nodes.getNodeAs<DeclRefExpr>("cudaEnumConstantRef")) {
StringRef name = enumConstantRef->getDecl()->getName();
SourceLocation sl = enumConstantRef->getLocStart();
SourceManager *SM = Result.SourceManager;
// TODO: Make a lookup table just for enum values to improve performance.
const auto found = CUDA_IDENTIFIER_MAP.find(name);
if (found != CUDA_IDENTIFIER_MAP.end()) {
Statistics::current().incrementCounter(found->second, name.str());
if (!found->second.unsupported) {
StringRef repName = found->second.hipName;
Replacement Rep(*SM, sl, name.size(), repName);
FullSourceLoc fullSL(sl, *SM);
insertReplacement(Rep, fullSL);
}
} else {
std::string msg = "the following reference is not handled: '" + name.str() + "' [enum constant ref].";
printHipifyMessage(*SM, sl, msg);
}
return true;
}
return false;
}
bool cudaType(const MatchFinder::MatchResult& Result) {
const clang::TypeLoc* ret = Result.Nodes.getNodeAs<TypeLoc>("cudaType");
if (!ret) {
return false;
}
// Ignore qualifiers - they don't alter our decision to rename.
clang::UnqualTypeLoc tl = ret->getUnqualifiedLoc();
const Type& typeObject = *(tl.getTypePtr());
std::string typeName = tl.getType().getAsString();
// Irritatingly, enum/struct types are identified as `enum/struct <something>`, and unlike most compound
// types (such as pointers or references), there isn't another type node inside. So we have
// to make do with what we've got. There's probably a better way of doing this...
if (typeObject.isEnumeralType()) {
removePrefixIfPresent(typeName, "enum ");
}
if (typeObject.isStructureType()) {
removePrefixIfPresent(typeName, "struct ");
}
// Do we have a replacement for this type?
const auto found = CUDA_TYPE_NAME_MAP.find(typeName);
if (found == CUDA_TYPE_NAME_MAP.end()) {
return false;
}
SourceManager &SM = *(Result.SourceManager);
// Start of the type expression to replace.
SourceLocation sl = tl.getBeginLoc();
const hipCounter& hipCtr = found->second;
if (hipCtr.unsupported) {
printHipifyMessage(SM, sl, "Unsupported CUDA '" + typeName);
return false;
}
// Apply the rename!
Replacement Rep(SM, sl, typeName.size(), hipCtr.hipName);
FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
return true;
}
bool cudaSharedIncompleteArrayVar(const MatchFinder::MatchResult &Result) {
StringRef refName = "cudaSharedIncompleteArrayVar";
if (const VarDecl *sharedVar = Result.Nodes.getNodeAs<VarDecl>(refName)) {
// Example: extern __shared__ uint sRadix1[];
if (sharedVar->hasExternalFormalLinkage()) {
QualType QT = sharedVar->getType();
std::string typeName;
if (QT->isIncompleteArrayType()) {
const ArrayType *AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
QT = AT->getElementType();
if (QT.getTypePtr()->isBuiltinType()) {
QT = QT.getCanonicalType();
const BuiltinType *BT = dyn_cast<BuiltinType>(QT);
if (BT) {
LangOptions LO;
LO.CUDA = true;
PrintingPolicy policy(LO);
typeName = BT->getName(policy);
}
} else {
typeName = QT.getAsString();
}
}
if (!typeName.empty()) {
SourceLocation slStart = sharedVar->getLocStart();
SourceLocation slEnd = sharedVar->getLocEnd();
SourceManager *SM = Result.SourceManager;
size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
std::string varName = sharedVar->getNameAsString();
std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")";
Replacement Rep(*SM, slStart, repLength, repName);
FullSourceLoc fullSL(slStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = { "HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME };
Statistics::current().incrementCounter(counter, refName.str());
}
}
return true;
}
return false;
}
bool stringLiteral(const MatchFinder::MatchResult &Result) {
if (const clang::StringLiteral *sLiteral = Result.Nodes.getNodeAs<clang::StringLiteral>("stringLiteral")) {
if (sLiteral->getCharByteWidth() == 1) {
StringRef s = sLiteral->getString();
SourceManager *SM = Result.SourceManager;
processString(s, *SM, sLiteral->getLocStart());
}
return true;
}
return false;
}
public:
Cuda2HipCallback(Replacements& Replace, ast_matchers::MatchFinder *parent, HipifyPPCallbacks *PPCallbacks, const std::string &mainFileName)
: Cuda2Hip(Replace, mainFileName), owner(parent), PP(PPCallbacks) {
PP->setMatch(this);
}
void run(const MatchFinder::MatchResult &Result) override {
if (cudaType(Result)) return;
if (cudaCall(Result)) return;
if (cudaBuiltin(Result)) return;
if (cudaEnumConstantRef(Result)) return;
if (cudaLaunchKernel(Result)) return;
if (cudaSharedIncompleteArrayVar(Result)) return;
if (stringLiteral(Result)) return;
}
private:
ast_matchers::MatchFinder *owner;
HipifyPPCallbacks *PP;
};
void HipifyPPCallbacks::handleEndSource() {
insertHipHeaders(Match, *_sm);
}
void addAllMatchers(ast_matchers::MatchFinder &Finder, Cuda2HipCallback *Callback) {
// Rewrite CUDA api calls to hip ones.
Finder.addMatcher(
callExpr(
isExpansionInMainFile(),
callee(
functionDecl(
matchesName("cu.*"),
unless(
// Clang generates structs with functions on them to represent things like
// threadIdx.x. We have other logic to handle those builtins directly, so
// we need to suppress the call-handling.
// We can't handle those directly in the call-handler without special-casing
// it unpleasantly, since the names of the functions are unique only per-struct.
matchesName("__fetch_builtin.*")
)
)
)
).bind("cudaCall"),
Callback
);
// Rewrite all references to CUDA types to their corresponding hip types.
Finder.addMatcher(
typeLoc(
isExpansionInMainFile()
).bind("cudaType"),
Callback
);
// Replace references to CUDA names in string literals with the equivalent hip names.
Finder.addMatcher(stringLiteral(isExpansionInMainFile()).bind("stringLiteral"), Callback);
// Replace the <<<...>>> language extension with a hip kernel launch
Finder.addMatcher(cudaKernelCallExpr(isExpansionInMainFile()).bind("cudaLaunchKernel"), Callback);
// Replace cuda builtins.
Finder.addMatcher(
memberExpr(
isExpansionInMainFile(),
hasObjectExpression(
hasType(
cxxRecordDecl(
matchesName("__cuda_builtin_")
)
)
)
).bind("cudaBuiltin"),
Callback
);
// Map CUDA enum _values_ to their hip equivalents.
Finder.addMatcher(
declRefExpr(
isExpansionInMainFile(),
to(
enumConstantDecl(
matchesName("cu.*|CU.*")
)
)
).bind("cudaEnumConstantRef"),
Callback
);
Finder.addMatcher(
varDecl(
isExpansionInMainFile(),
allOf(
hasAttr(attr::CUDAShared),
hasType(incompleteArrayType())
)
).bind("cudaSharedIncompleteArrayVar"),
Callback
);
}
void copyFile(const std::string& src, const std::string& dst) {
std::ifstream source(src, std::ios::binary);
std::ofstream dest(dst, std::ios::binary);
dest << source.rdbuf();
}
int main(int argc, const char **argv) {
llcompat::PrintStackTraceOnErrorSignal();
CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
std::string dst = OutputFilename;
if (!dst.empty() && fileSources.size() > 1) {
llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n";
return 1;
}
if (NoOutput) {
if (Inplace) {
llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n";
return 1;
}
if (!dst.empty()) {
llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n";
return 1;
}
}
if (Examine) {
NoOutput = PrintStats = true;
}
int Result = 0;
// Arguments for the Statistics print routines.
std::unique_ptr<std::ostream> csv = nullptr;
llvm::raw_ostream* statPrint = nullptr;
if (!OutputStatsFilename.empty()) {
csv = std::unique_ptr<std::ostream>(new std::ofstream(OutputStatsFilename, std::ios_base::trunc));
}
if (PrintStats) {
statPrint = &llvm::errs();
}
for (const auto & src : fileSources) {
if (dst.empty()) {
if (Inplace) {
dst = src;
} else {
dst = src + ".hip";
}
} else if (Inplace) {
llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n";
return 1;
}
std::string tmpFile = src + ".hipify-tmp";
// Create a copy of the file to work on. When we're done, we'll move this onto the
// output (which may mean overwriting the input, if we're in-place).
// Should we fail for some reason, we'll just leak this file and not corrupt the input.
copyFile(src, tmpFile);
// Initialise the statistics counters for this file.
Statistics::setActive(src);
// RefactoringTool operates on the file in-place. Giving it the output path is no good,
// because that'll break relative includes, and we don't want to overwrite the input file.
// So what we do is operate on a copy, which we then move to the output.
RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile);
ast_matchers::MatchFinder Finder;
// The Replacements to apply to the file `src`.
Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile);
HipifyPPCallbacks* PPCallbacks = new HipifyPPCallbacks(replacementsToUse, tmpFile);
Cuda2HipCallback Callback(replacementsToUse, &Finder, PPCallbacks, tmpFile);
addAllMatchers(Finder, &Callback);
auto action = newFrontendActionFactory(&Finder, PPCallbacks);
Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("--cuda-host-only", ArgumentInsertPosition::BEGIN));
// Ensure at least c++11 is used.
Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-std=c++11", ArgumentInsertPosition::BEGIN));
#if defined(HIPIFY_CLANG_RES)
Tool.appendArgumentsAdjuster(getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
#endif
Tool.appendArgumentsAdjuster(getClangSyntaxOnlyAdjuster());
Result += Tool.run(action.get());
Tool.clearArgumentsAdjusters();
LangOptions DefaultLangOptions;
IntrusiveRefCntPtr<DiagnosticOptions> DiagOpts = new DiagnosticOptions();
TextDiagnosticPrinter DiagnosticPrinter(llvm::errs(), &*DiagOpts);
DiagnosticsEngine Diagnostics(IntrusiveRefCntPtr<DiagnosticIDs>(new DiagnosticIDs()), &*DiagOpts, &DiagnosticPrinter, false);
SourceManager SM(Diagnostics, Tool.getFiles());
Rewriter Rewrite(SM, DefaultLangOptions);
if (!Tool.applyAllReplacements(Rewrite)) {
DEBUG(dbgs() << "Skipped some replacements.\n");
}
// Either move the tmpfile to the output, or remove it.
if (!NoOutput) {
Result += Rewrite.overwriteChangedFiles();
rename(tmpFile.c_str(), dst.c_str());
} else {
remove(tmpFile.c_str());
}
Statistics::current().markCompletion();
Statistics::current().print(csv.get(), statPrint);
dst.clear();
}
if (fileSources.size() > 1) {
Statistics::printAggregate(csv.get(), statPrint);
}
return Result;
}
+460
ファイルの表示
@@ -0,0 +1,460 @@
#include "HipifyAction.h"
#include <memory>
#include "clang/Basic/SourceLocation.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "clang/ASTMatchers/ASTMatchers.h"
#include "LLVMCompat.h"
#include "CUDA2HipMap.h"
#include "StringUtils.h"
#include "ArgParse.h"
namespace ct = clang::tooling;
namespace mat = clang::ast_matchers;
void HipifyAction::RewriteString(StringRef s, clang::SourceLocation start) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
size_t begin = 0;
while ((begin = s.find("cu", begin)) != StringRef::npos) {
const size_t end = s.find_first_of(" ", begin + 4);
StringRef name = s.slice(begin, end);
const auto found = CUDA_RENAMES_MAP().find(name);
if (found != CUDA_RENAMES_MAP().end()) {
StringRef repName = found->second.hipName;
hipCounter counter = {"[string literal]", ConvTypes::CONV_LITERAL, ApiTypes::API_RUNTIME, found->second.unsupported};
Statistics::current().incrementCounter(counter, name.str());
if (!counter.unsupported) {
clang::SourceLocation sl = start.getLocWithOffset(begin + 1);
ct::Replacement Rep(SM, sl, name.size(), repName);
clang::FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
}
}
if (end == StringRef::npos) {
break;
}
begin = end + 1;
}
}
/**
* Look at, and consider altering, a given token.
*
* If it's not a CUDA identifier, nothing happens.
* If it's an unsupported CUDA identifier, a warning is emitted.
* Otherwise, the source file is updated with the corresponding hipification.
*/
void HipifyAction::RewriteToken(const clang::Token& t) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
// String literals containing CUDA references need fixing...
if (t.is(clang::tok::string_literal)) {
StringRef s(t.getLiteralData(), t.getLength());
RewriteString(unquoteStr(s), t.getLocation());
return;
} else if (!t.isAnyIdentifier()) {
// If it's neither a string nor an identifier, we don't care.
return;
}
StringRef name = t.getRawIdentifier();
const auto found = CUDA_RENAMES_MAP().find(name);
if (found == CUDA_RENAMES_MAP().end()) {
// So it's an identifier, but not CUDA? Boring.
return;
}
Statistics::current().incrementCounter(found->second, name.str());
clang::SourceLocation sl = t.getLocation();
if (found->second.unsupported) {
// An unsupported identifier? Curses! Warn the user.
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "CUDA identifier unsupported in hip");
DE.Report(sl, ID);
return;
}
StringRef repName = found->second.hipName;
ct::Replacement Rep(SM, sl, name.size(), repName);
clang::FullSourceLoc fullSL(sl, SM);
insertReplacement(Rep, fullSL);
}
namespace {
clang::SourceRange getReadRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
bool beginSafe = !SM.isMacroBodyExpansion(begin) || clang::Lexer::isAtStartOfMacroExpansion(begin, SM, clang::LangOptions{});
bool endSafe = !SM.isMacroBodyExpansion(end) || clang::Lexer::isAtEndOfMacroExpansion(end, SM, clang::LangOptions{});
if (beginSafe && endSafe) {
return {SM.getFileLoc(begin), SM.getFileLoc(end)};
} else {
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
}
clang::SourceRange getWriteRange(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
clang::SourceLocation begin = exprRange.getBegin();
clang::SourceLocation end = exprRange.getEnd();
// If the range is contained within a macro, update the macro definition.
// Otherwise, use the file location and hope for the best.
if (!SM.isMacroBodyExpansion(begin) || !SM.isMacroBodyExpansion(end)) {
return {SM.getFileLoc(begin), SM.getFileLoc(end)};
}
return {SM.getSpellingLoc(begin), SM.getSpellingLoc(end)};
}
StringRef readSourceText(clang::SourceManager& SM, const clang::SourceRange& exprRange) {
return clang::Lexer::getSourceText(clang::CharSourceRange::getTokenRange(getReadRange(SM, exprRange)), SM, clang::LangOptions(), nullptr);
}
/**
* Get a string representation of the expression `arg`, unless it's a defaulting function
* call argument, in which case get a 0. Used for building argument lists to kernel calls.
*/
std::string stringifyZeroDefaultedArg(clang::SourceManager& SM, const clang::Expr* arg) {
if (clang::isa<clang::CXXDefaultArgExpr>(arg)) {
return "0";
} else {
return readSourceText(SM, arg->getSourceRange());
}
}
} // anonymous namespace
void HipifyAction::InclusionDirective(clang::SourceLocation hash_loc,
const clang::Token&,
StringRef file_name,
bool is_angled,
clang::CharSourceRange filename_range,
const clang::FileEntry*, StringRef,
StringRef, const clang::Module*) {
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
if (!SM.isWrittenInMainFile(hash_loc)) {
return;
}
const auto found = CUDA_INCLUDE_MAP.find(file_name);
if (found == CUDA_INCLUDE_MAP.end()) {
// Not a CUDA include - don't touch it.
return;
}
// Special-casing to avoid duplication of the hip_runtime include.
if (found->second.hipName == "hip/hip_runtime.h") {
if (insertedRuntimeHeader) {
return;
}
insertedRuntimeHeader = true;
}
Statistics::current().incrementCounter(found->second, file_name.str());
clang::SourceLocation sl = filename_range.getBegin();
if (found->second.unsupported) {
// An unsupported CUDA header? Oh dear. Print a warning.
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
DE.Report(sl, DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unsupported CUDA header"));
return;
}
const char *B = SM.getCharacterData(sl);
const char *E = SM.getCharacterData(filename_range.getEnd());
clang::SmallString<128> includeBuffer;
clang::StringRef newInclude;
// Keep the same include type that the user gave.
if (is_angled) {
newInclude = llvm::Twine("<" + found->second.hipName + ">").toStringRef(includeBuffer);
} else {
newInclude = llvm::Twine("\"" + found->second.hipName + "\"").toStringRef(includeBuffer);
}
ct::Replacement Rep(SM, sl, E - B, newInclude);
insertReplacement(Rep, clang::FullSourceLoc{sl, SM});
}
bool HipifyAction::cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
StringRef refName = "cudaLaunchKernel";
const auto* launchKernel = Result.Nodes.getNodeAs<clang::CUDAKernelCallExpr>(refName);
if (!launchKernel) {
return false;
}
clang::SmallString<40> XStr;
llvm::raw_svector_ostream OS(XStr);
clang::LangOptions DefaultLangOptions;
clang::SourceManager* SM = Result.SourceManager;
const clang::Expr& calleeExpr = *(launchKernel->getCallee());
OS << "hipLaunchKernelGGL(" << readSourceText(*SM, calleeExpr.getSourceRange()) << ", ";
// Next up are the four kernel configuration parameters, the last two of which are optional and default to zero.
const clang::CallExpr& config = *(launchKernel->getConfig());
// Copy the two dimensional arguments verbatim.
OS << "dim3(" << readSourceText(*SM, config.getArg(0)->getSourceRange()) << "), ";
OS << "dim3(" << readSourceText(*SM, config.getArg(1)->getSourceRange()) << "), ";
// The stream/memory arguments default to zero if omitted.
OS << stringifyZeroDefaultedArg(*SM, config.getArg(2)) << ", ";
OS << stringifyZeroDefaultedArg(*SM, config.getArg(3));
// If there are ordinary arguments to the kernel, just copy them verbatim into our new call.
int numArgs = launchKernel->getNumArgs();
if (numArgs > 0) {
OS << ", ";
// Start of the first argument.
clang::SourceLocation argStart = launchKernel->getArg(0)->getLocStart();
// End of the last argument.
clang::SourceLocation argEnd = launchKernel->getArg(numArgs - 1)->getLocEnd();
OS << readSourceText(*SM, {argStart, argEnd});
}
OS << ")";
clang::SourceRange replacementRange = getWriteRange(*SM, {launchKernel->getLocStart(), launchKernel->getLocEnd()});
clang::SourceLocation launchStart = replacementRange.getBegin();
clang::SourceLocation launchEnd = replacementRange.getEnd();
size_t length = SM->getCharacterData(clang::Lexer::getLocForEndOfToken(launchEnd, 0, *SM, DefaultLangOptions)) - SM->getCharacterData(launchStart);
ct::Replacement Rep(*SM, launchStart, length, OS.str());
clang::FullSourceLoc fullSL(launchStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {"hipLaunchKernelGGL", ConvTypes::CONV_KERN, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, refName.str());
return true;
}
bool HipifyAction::cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
const clang::MemberExpr* threadIdx = Result.Nodes.getNodeAs<clang::MemberExpr>("cudaBuiltin");
if (!threadIdx) {
return false;
}
const clang::OpaqueValueExpr* refBase = clang::dyn_cast<clang::OpaqueValueExpr>(threadIdx->getBase());
if (!refBase) {
return false;
}
const clang::DeclRefExpr* declRef = clang::dyn_cast<clang::DeclRefExpr>(refBase->getSourceExpr());
if (!declRef) {
return false;
}
clang::SourceLocation sl = threadIdx->getLocStart();
clang::SourceManager* SM = Result.SourceManager;
StringRef name = declRef->getDecl()->getName();
StringRef memberName = threadIdx->getMemberDecl()->getName();
size_t pos = memberName.find_first_not_of("__fetch_builtin_");
memberName = memberName.slice(pos, memberName.size());
clang::SmallString<128> tmpData;
name = clang::Twine(name + "." + memberName).toStringRef(tmpData);
const auto found = CUDA_IDENTIFIER_MAP.find(name);
if (found != CUDA_IDENTIFIER_MAP.end()) {
Statistics::current().incrementCounter(found->second, name.str());
if (!found->second.unsupported) {
StringRef repName = found->second.hipName;
ct::Replacement Rep(*SM, sl, name.size(), repName);
clang::FullSourceLoc fullSL(sl, *SM);
insertReplacement(Rep, fullSL);
}
} else {
clang::DiagnosticsEngine& DE = getCompilerInstance().getDiagnostics();
const auto ID = DE.getCustomDiagID(clang::DiagnosticsEngine::Warning, "Unknown CUDA builtin");
DE.Report(sl, ID);
}
return true;
}
bool HipifyAction::cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
StringRef refName = "cudaSharedIncompleteArrayVar";
auto* sharedVar = Result.Nodes.getNodeAs<clang::VarDecl>(refName);
if (!sharedVar) {
return false;
}
// Example: extern __shared__ uint sRadix1[];
if (!sharedVar->hasExternalFormalLinkage()) {
return false;
}
clang::QualType QT = sharedVar->getType();
std::string typeName;
if (QT->isIncompleteArrayType()) {
const clang::ArrayType* AT = QT.getTypePtr()->getAsArrayTypeUnsafe();
QT = AT->getElementType();
if (QT.getTypePtr()->isBuiltinType()) {
QT = QT.getCanonicalType();
const auto* BT = clang::dyn_cast<clang::BuiltinType>(QT);
if (BT) {
clang::LangOptions LO;
LO.CUDA = true;
clang::PrintingPolicy policy(LO);
typeName = BT->getName(policy);
}
} else {
typeName = QT.getAsString();
}
}
if (!typeName.empty()) {
clang::SourceLocation slStart = sharedVar->getLocStart();
clang::SourceLocation slEnd = sharedVar->getLocEnd();
clang::SourceManager* SM = Result.SourceManager;
size_t repLength = SM->getCharacterData(slEnd) - SM->getCharacterData(slStart) + 1;
std::string varName = sharedVar->getNameAsString();
std::string repName = "HIP_DYNAMIC_SHARED(" + typeName + ", " + varName + ")";
ct::Replacement Rep(*SM, slStart, repLength, repName);
clang::FullSourceLoc fullSL(slStart, *SM);
insertReplacement(Rep, fullSL);
hipCounter counter = {"HIP_DYNAMIC_SHARED", ConvTypes::CONV_MEM, ApiTypes::API_RUNTIME};
Statistics::current().incrementCounter(counter, refName.str());
}
return true;
}
void HipifyAction::insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL) {
llcompat::insertReplacement(*replacements, rep);
if (PrintStats) {
rep.getLength();
Statistics::current().lineTouched(fullSL.getExpansionLineNumber());
Statistics::current().bytesChanged(rep.getLength());
}
}
std::unique_ptr<clang::ASTConsumer> HipifyAction::CreateASTConsumer(clang::CompilerInstance& CI, llvm::StringRef) {
Finder.reset(new clang::ast_matchers::MatchFinder);
// Replace the <<<...>>> language extension with a hip kernel launch
Finder->addMatcher(mat::cudaKernelCallExpr(mat::isExpansionInMainFile()).bind("cudaLaunchKernel"), this);
// Replace cuda builtins.
Finder->addMatcher(
mat::memberExpr(
mat::isExpansionInMainFile(),
mat::hasObjectExpression(
mat::hasType(
mat::cxxRecordDecl(
mat::matchesName("__cuda_builtin_")
)
)
)
).bind("cudaBuiltin"),
this
);
Finder->addMatcher(
mat::varDecl(
mat::isExpansionInMainFile(),
mat::allOf(
mat::hasAttr(clang::attr::CUDAShared),
mat::hasType(mat::incompleteArrayType())
)
).bind("cudaSharedIncompleteArrayVar"),
this
);
// Ownership is transferred to the caller...
return Finder->newASTConsumer();
}
void HipifyAction::EndSourceFileAction() {
// Insert the hip header, if we didn't already do it by accident during substitution.
if (!insertedRuntimeHeader) {
// It's not sufficient to just replace CUDA headers with hip ones, because numerous CUDA headers are
// implicitly included by the compiler. Instead, we _delete_ CUDA headers, and unconditionally insert
// one copy of the hip include into every file.
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
clang::SourceLocation sl = SM.getLocForStartOfFile(SM.getMainFileID());
clang::FullSourceLoc fullSL(sl, SM);
ct::Replacement Rep(SM, sl, 0, "#include <hip/hip_runtime.h>\n");
insertReplacement(Rep, fullSL);
}
clang::ASTFrontendAction::EndSourceFileAction();
}
namespace {
/**
* A silly little class to proxy PPCallbacks back to the HipifyAction class.
*/
class PPCallbackProxy : public clang::PPCallbacks {
HipifyAction& hipifyAction;
public:
explicit PPCallbackProxy(HipifyAction& action): hipifyAction(action) {}
void InclusionDirective(clang::SourceLocation hash_loc, const clang::Token& include_token,
StringRef file_name, bool is_angled, clang::CharSourceRange filename_range,
const clang::FileEntry* file, StringRef search_path, StringRef relative_path,
const clang::Module* imported) override {
hipifyAction.InclusionDirective(hash_loc, include_token, file_name, is_angled, filename_range, file, search_path, relative_path, imported);
}
};
}
void HipifyAction::ExecuteAction() {
clang::Preprocessor& PP = getCompilerInstance().getPreprocessor();
clang::SourceManager& SM = getCompilerInstance().getSourceManager();
// Start lexing the specified input file.
const llvm::MemoryBuffer* FromFile = SM.getBuffer(SM.getMainFileID());
clang::Lexer RawLex(SM.getMainFileID(), FromFile, SM, PP.getLangOpts());
RawLex.SetKeepWhitespaceMode(true);
// Perform a token-level rewrite of CUDA identifiers to hip ones. The raw-mode lexer gives us enough
// information to tell the difference between identifiers, string literals, and "other stuff". It also
// ignores preprocessor directives, so this transformation will operate inside preprocessor-deleted
// code.
clang::Token RawTok;
RawLex.LexFromRawLexer(RawTok);
while (RawTok.isNot(clang::tok::eof)) {
RewriteToken(RawTok);
RawLex.LexFromRawLexer(RawTok);
}
// Register yourself as the preprocessor callback, by proxy.
PP.addPPCallbacks(std::unique_ptr<PPCallbackProxy>(new PPCallbackProxy(*this)));
// Now we're done futzing with the lexer, have the subclass proceeed with Sema and AST matching.
clang::ASTFrontendAction::ExecuteAction();
}
void HipifyAction::run(const clang::ast_matchers::MatchFinder::MatchResult& Result) {
if (cudaBuiltin(Result)) return;
if (cudaLaunchKernel(Result)) return;
if (cudaSharedIncompleteArrayVar(Result)) return;
}
+83
ファイルの表示
@@ -0,0 +1,83 @@
#pragma once
#include "clang/Lex/PPCallbacks.h"
#include "clang/Tooling/Tooling.h"
#include "clang/Frontend/FrontendAction.h"
#include "clang/Tooling/Core/Replacement.h"
#include "clang/ASTMatchers/ASTMatchFinder.h"
#include "ReplacementsFrontendActionFactory.h"
namespace ct = clang::tooling;
/**
* A FrontendAction that hipifies CUDA programs.
*/
class HipifyAction : public clang::ASTFrontendAction,
public clang::ast_matchers::MatchFinder::MatchCallback {
private:
ct::Replacements* replacements;
std::unique_ptr<clang::ast_matchers::MatchFinder> Finder;
/// CUDA implicitly adds its runtime header. We rewrite explicitly-provided CUDA includes with equivalent
// ones, and track - using this flag - if the result led to us including the hip runtime header. If it did
// not, we insert it at the top of the file when we finish processing it.
// This approach means we do the best it's possible to do w.r.t preserving the user's include order.
bool insertedRuntimeHeader = false;
/**
* Rewrite a string literal to refer to hip, not CUDA.
*/
void RewriteString(StringRef s, clang::SourceLocation start);
/**
* Replace a CUDA identifier with the corresponding hip identifier, if applicable.
*/
void RewriteToken(const clang::Token &t);
public:
explicit HipifyAction(ct::Replacements *replacements):
clang::ASTFrontendAction(),
replacements(replacements) {}
// MatchCallback listeners
bool cudaBuiltin(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaLaunchKernel(const clang::ast_matchers::MatchFinder::MatchResult& Result);
bool cudaSharedIncompleteArrayVar(const clang::ast_matchers::MatchFinder::MatchResult& Result);
/**
* Called by the preprocessor for each include directive during the non-raw lexing pass.
*/
void InclusionDirective(clang::SourceLocation hash_loc,
const clang::Token &include_token,
StringRef file_name,
bool is_angled,
clang::CharSourceRange filename_range,
const clang::FileEntry *file,
StringRef search_path,
StringRef relative_path,
const clang::Module *imported);
protected:
/**
* Add a Replacement for the current file. These will all be applied after executing the FrontendAction.
*/
void insertReplacement(const ct::Replacement& rep, const clang::FullSourceLoc& fullSL);
/**
* FrontendAction entry point.
*/
void ExecuteAction() override;
/**
* Called at the start of each new file to process.
*/
void EndSourceFileAction() override;
/**
* MatchCallback API entry point. Called by the AST visitor while searching the AST for things we registered an
* interest for.
*/
void run(const clang::ast_matchers::MatchFinder::MatchResult& Result) override;
std::unique_ptr<clang::ASTConsumer> CreateASTConsumer(clang::CompilerInstance &CI, llvm::StringRef InFile) override;
};
+28
ファイルの表示
@@ -0,0 +1,28 @@
#pragma once
#include "clang/Tooling/Tooling.h"
#include "clang/Frontend/FrontendAction.h"
#include "clang/Tooling/Core/Replacement.h"
namespace ct = clang::tooling;
/**
* A FrontendActionFactory that propagates a set of Replacements into the FrontendAction.
* This is necessary boilerplate for using a custom FrontendAction with a RefactoringTool.
*
* @tparam T The FrontendAction to create.
*/
template <typename T>
class ReplacementsFrontendActionFactory : public ct::FrontendActionFactory {
ct::Replacements* replacements;
public:
explicit ReplacementsFrontendActionFactory(ct::Replacements* r):
ct::FrontendActionFactory(),
replacements(r) {}
clang::FrontendAction* create() override {
return new T(replacements);
}
};
+155
ファイルの表示
@@ -0,0 +1,155 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/**
* @file Cuda2Hip.cpp
*
* This file is compiled and linked into clang based hipify tool.
*/
#include <cstdio>
#include <fstream>
#include <set>
#include <cmath>
#include <chrono>
#include <iomanip>
#include <sstream>
#include "CUDA2HipMap.h"
#include "LLVMCompat.h"
#include "HipifyAction.h"
#include "ArgParse.h"
#define DEBUG_TYPE "cuda2hip"
namespace ct = clang::tooling;
namespace {
void copyFile(const std::string& src, const std::string& dst) {
std::ifstream source(src, std::ios::binary);
std::ofstream dest(dst, std::ios::binary);
dest << source.rdbuf();
}
} // anonymous namespace
int main(int argc, const char **argv) {
llcompat::PrintStackTraceOnErrorSignal();
ct::CommonOptionsParser OptionsParser(argc, argv, ToolTemplateCategory, llvm::cl::OneOrMore);
std::vector<std::string> fileSources = OptionsParser.getSourcePathList();
std::string dst = OutputFilename;
if (!dst.empty() && fileSources.size() > 1) {
llvm::errs() << "[HIPIFY] conflict: -o and multiple source files are specified.\n";
return 1;
}
if (NoOutput) {
if (Inplace) {
llvm::errs() << "[HIPIFY] conflict: both -no-output and -inplace options are specified.\n";
return 1;
}
if (!dst.empty()) {
llvm::errs() << "[HIPIFY] conflict: both -no-output and -o options are specified.\n";
return 1;
}
}
if (Examine) {
NoOutput = PrintStats = true;
}
int Result = 0;
// Arguments for the Statistics print routines.
std::unique_ptr<std::ostream> csv = nullptr;
llvm::raw_ostream* statPrint = nullptr;
if (!OutputStatsFilename.empty()) {
csv = std::unique_ptr<std::ostream>(new std::ofstream(OutputStatsFilename, std::ios_base::trunc));
}
if (PrintStats) {
statPrint = &llvm::errs();
}
for (const auto & src : fileSources) {
if (dst.empty()) {
if (Inplace) {
dst = src;
} else {
dst = src + ".hip";
}
} else if (Inplace) {
llvm::errs() << "[HIPIFY] conflict: both -o and -inplace options are specified.\n";
return 1;
}
std::string tmpFile = src + ".hipify-tmp";
// Create a copy of the file to work on. When we're done, we'll move this onto the
// output (which may mean overwriting the input, if we're in-place).
// Should we fail for some reason, we'll just leak this file and not corrupt the input.
copyFile(src, tmpFile);
// Initialise the statistics counters for this file.
Statistics::setActive(src);
// RefactoringTool operates on the file in-place. Giving it the output path is no good,
// because that'll break relative includes, and we don't want to overwrite the input file.
// So what we do is operate on a copy, which we then move to the output.
ct::RefactoringTool Tool(OptionsParser.getCompilations(), tmpFile);
ct::Replacements& replacementsToUse = llcompat::getReplacements(Tool, tmpFile);
ReplacementsFrontendActionFactory<HipifyAction> actionFactory(&replacementsToUse);
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("--cuda-host-only", ct::ArgumentInsertPosition::BEGIN));
// Ensure at least c++11 is used.
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-std=c++11", ct::ArgumentInsertPosition::BEGIN));
#if defined(HIPIFY_CLANG_RES)
Tool.appendArgumentsAdjuster(ct::getInsertArgumentAdjuster("-resource-dir=" HIPIFY_CLANG_RES));
#endif
Tool.appendArgumentsAdjuster(ct::getClangSyntaxOnlyAdjuster());
// Hipify _all_ the things!
if (Tool.runAndSave(&actionFactory)) {
DEBUG(llvm::dbgs() << "Skipped some replacements.\n");
}
// Either move the tmpfile to the output, or remove it.
if (!NoOutput) {
rename(tmpFile.c_str(), dst.c_str());
} else {
remove(tmpFile.c_str());
}
Statistics::current().markCompletion();
Statistics::current().print(csv.get(), statPrint);
dst.clear();
}
if (fileSources.size() > 1) {
Statistics::printAggregate(csv.get(), statPrint);
}
return Result;
}
+43 -3
ファイルの表示
@@ -50,10 +50,16 @@ THE SOFTWARE.
#include <hip/hip_runtime_api.h>
// define HIP_ENABLE_PRINTF to enable printf
#ifdef HIP_ENABLE_PRINTF
#define HCC_ENABLE_ACCELERATOR_PRINTF 1
#endif
//---
// Remainder of this file only compiles with HCC
#if defined __HCC__
#include <grid_launch.h>
#include "hc_printf.hpp"
//TODO-HCC-GL - change this to typedef.
//typedef grid_launch_parm hipLaunchParm ;
@@ -108,13 +114,12 @@ extern int HIP_TRACE_API;
#if defined(__HCC_ACCELERATOR__) && (__HCC_ACCELERATOR__ != 0)
// Device compile and not host compile:
//TODO-HCC enable __HIP_ARCH_HAS_ATOMICS__ when HCC supports these.
// 32-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_GLOBAL_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_SHARED_INT32_ATOMICS__ (1)
#define __HIP_ARCH_HAS_SHARED_FLOAT_ATOMIC_EXCH__ (1)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (0)
#define __HIP_ARCH_HAS_FLOAT_ATOMIC_ADD__ (1)
// 64-bit Atomics:
#define __HIP_ARCH_HAS_GLOBAL_INT64_ATOMICS__ (1)
@@ -376,6 +381,27 @@ __device__ void __threadfence_system(void) ;
* @}
*/
template<typename std::common_type<
decltype(hc_get_group_id),
decltype(hc_get_group_size),
decltype(hc_get_num_groups),
decltype(hc_get_workitem_id)>::type f>
class Coordinates {
using R = decltype(f(0));
struct X { __device__ operator R() const { return f(0); } };
struct Y { __device__ operator R() const { return f(1); } };
struct Z { __device__ operator R() const { return f(2); } };
public:
static constexpr X x{};
static constexpr Y y{};
static constexpr Z z{};
};
static constexpr Coordinates<hc_get_group_size> blockDim;
static constexpr Coordinates<hc_get_group_id> blockIdx;
static constexpr Coordinates<hc_get_num_groups> gridDim;
static constexpr Coordinates<hc_get_workitem_id> threadIdx;
#define hipThreadIdx_x (hc_get_workitem_id(0))
#define hipThreadIdx_y (hc_get_workitem_id(1))
@@ -420,6 +446,20 @@ static inline __device__ void* memset(void* ptr, int val, size_t size)
}
#ifdef __HCC_ACCELERATOR__
#ifdef HC_FEATURE_PRINTF
template <typename... All>
static inline __device__ void printf(const char* format, All... all) {
hc::printf(format, all...);
}
#else
template <typename... All>
static inline __device__ void printf(const char* format, All... all) { }
#endif
#endif
#define __syncthreads() hc_barrier(CLK_LOCAL_MEM_FENCE)
@@ -462,7 +502,7 @@ do {\
type* var = \
(type*)__get_dynamicgroupbaseptr(); \
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
#define HIP_DYNAMIC_SHARED_ATTRIBUTE
+1 -1
ファイルの表示
@@ -28,7 +28,7 @@ set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "perl (>= 5.0)")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
+1 -1
ファイルの表示
@@ -33,7 +33,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+3 -3
ファイルの表示
@@ -37,14 +37,14 @@ else()
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), ${HCC_PACKAGE_NAME} (= @HCC_PACKAGE_VERSION@)")
endif()
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
if(@COMPILE_HIP_ATP_MARKER@)
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler, libstdc++-static")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, rocm-profiler")
else()
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@, libstdc++-static")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}, ${HCC_PACKAGE_NAME} = @HCC_PACKAGE_VERSION@")
endif()
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
set(CPACK_SOURCE_GENERATOR "TGZ")
+1 -1
ファイルの表示
@@ -20,7 +20,7 @@ set(CPACK_BINARY_DEB "ON")
#set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJECT_BINARY_DIR}/postinst;${PROJECT_BINARY_DIR}/prerm")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION}), cuda (>= 7.5)")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
#set(CPACK_RPM_POST_INSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/postinst")
#set(CPACK_RPM_PRE_UNINSTALL_SCRIPT_FILE "${PROJECT_BINARY_DIR}/prerm")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
+1 -1
ファイルの表示
@@ -21,7 +21,7 @@ set(CPACK_GENERATOR "TGZ;DEB;RPM")
set(CPACK_BINARY_DEB "ON")
set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_base (= ${CPACK_PACKAGE_VERSION})")
set(CPACK_BINARY_RPM "ON")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "x86_64")
set(CPACK_RPM_PACKAGE_ARCHITECTURE "${CMAKE_SYSTEM_PROCESSOR}")
set(CPACK_RPM_PACKAGE_AUTOREQPROV " no")
set(CPACK_RPM_PACKAGE_REQUIRES "hip_base = ${CPACK_PACKAGE_VERSION}")
set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "/opt")
+3 -3
ファイルの表示
@@ -5,7 +5,7 @@ endif
HIPCC=$(HIP_PATH)/bin/hipcc
HIP_PLATFORM=$(shell $(HIP_PATH)/bin/hipconfig --compiler)
all: vcpy_kernel.code runKernel.hip.out defaultDriver.hip.out
all: vcpy_kernel.code runKernel.hip.out launchKernelHcc.hip.out
runKernel.hip.out: runKernel.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
@@ -13,8 +13,8 @@ runKernel.hip.out: runKernel.cpp
launchKernelHcc.hip.out: launchKernelHcc.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
defaultDriver.hip.out: defaultDriver.cpp
$(HIPCC) $(HIPCC_FLAGS) $< -o $@
#defaultDriver.hip.out: defaultDriver.cpp
# $(HIPCC) $(HIPCC_FLAGS) $< -o $@
vcpy_kernel.code: vcpy_kernel.cpp
$(HIPCC) --genco $(GENCO_FLAGS) $^ -o $@
+1
ファイルの表示
@@ -72,6 +72,7 @@ int main(){
uint32_t one = 1;
struct {
uint32_t _hidden[6];
void * _Ad;
void * _Bd;
} args;
+1
ファイルの表示
@@ -68,6 +68,7 @@ int main(){
uint32_t one = 1;
struct {
uint32_t _hidden[6];
void * _Ad;
void * _Bd;
} args;
-12
ファイルの表示
@@ -1,12 +0,0 @@
__kernel void memset(char in, __global int* out) {
int tx = get_global_id(0);
out[tx] = in;
}
__kernel void vadd(__global float *Ad, __global float *Bd, __global float *Cd, int N){
int tx = get_global_id(0);
if(tx < N){
Cd[tx] = Ad[tx] + Bd[tx];
}
}
バイナリファイルは表示されません。
+9 -4
ファイルの表示
@@ -23,6 +23,11 @@ THE SOFTWARE.
#include <hc_math.hpp>
#include "device_util.h"
extern "C" float __ocml_floor_f32(float);
extern "C" float __ocml_rint_f32(float);
extern "C" float __ocml_ceil_f32(float);
extern "C" float __ocml_trunc_f32(float);
struct holder64Bit{
union{
double d;
@@ -151,19 +156,19 @@ __device__ long long int __double_as_longlong(double x)
__device__ int __float2int_rd(float x)
{
return (int)x;
return (int)__ocml_floor_f32(x);
}
__device__ int __float2int_rn(float x)
{
return (int)x;
return (int)__ocml_rint_f32(x);
}
__device__ int __float2int_ru(float x)
{
return (int)x;
return (int)__ocml_ceil_f32(x);
}
__device__ int __float2int_rz(float x)
{
return (int)x;
return (int)__ocml_trunc_f32(x);
}
__device__ long long int __float2ll_rd(float x)
+6 -6
ファイルの表示
@@ -45,8 +45,8 @@ __device__ void *__hip_hc_malloc(size_t size)
{
return (void*)nullptr;
}
uint32_t totalThreads = hipBlockDim_x * hipGridDim_x * hipBlockDim_y * hipGridDim_y * hipBlockDim_z * hipGridDim_z;
uint32_t currentWorkItem = hipThreadIdx_x + hipBlockDim_x * hipBlockIdx_x;
uint32_t totalThreads = blockDim.x * gridDim.x * blockDim.y * gridDim.y * blockDim.z * gridDim.z;
uint32_t currentWorkItem = threadIdx.x + blockDim.x * blockIdx.x;
uint32_t numHeapsPerWorkItem = NUM_PAGES / totalThreads;
uint32_t heapSizePerWorkItem = SIZE_OF_HEAP / totalThreads;
@@ -932,7 +932,7 @@ __device__ unsigned long long int atomicMax(unsigned long long int* address,
template<typename T>
__device__ T atomicCAS_impl(T* address, T compare, T val)
{
// the implementation assumes the atomic is lock-free and
// the implementation assumes the atomic is lock-free and
// has the same size as the non-atmoic equivalent type
static_assert(sizeof(T) == sizeof(std::atomic<T>)
, "size mismatch between atomic and non-atomic types");
@@ -945,7 +945,7 @@ __device__ T atomicCAS_impl(T* address, T compare, T val)
T expected = compare;
// hcc should generate a system scope atomic CAS
// hcc should generate a system scope atomic CAS
std::atomic_compare_exchange_weak_explicit(u.atomic_address
, &expected, val
, std::memory_order_acq_rel
@@ -1110,8 +1110,8 @@ __device__ void* __get_dynamicgroupbaseptr() {
return hc::get_dynamic_group_segment_base_pointer();
}
__host__ void* __get_dynamicgroupbaseptr() {
return nullptr;
__host__ void* __get_dynamicgroupbaseptr() {
return nullptr;
}
// Precise Math Functions
+3
ファイルの表示
@@ -92,5 +92,8 @@ namespace hip_impl
delete static_cast<L*>(locked_stream);
locked_stream = nullptr;
if(HIP_PROFILE_API) {
MARKER_END();
}
}
}
+1 -1
ファイルの表示
@@ -269,7 +269,7 @@ hipError_t hipCtxGetSharedMemConfig ( hipSharedMemConfig * pConfig )
hipError_t hipCtxSynchronize ( void )
{
HIP_INIT_API(1);
return ihipSynchronize(); //TODP Shall check validity of ctx?
return ihipLogStatus(ihipSynchronize()); //TODP Shall check validity of ctx?
}
hipError_t hipCtxGetFlags ( unsigned int* flags )
+5 -5
ファイルの表示
@@ -60,12 +60,12 @@ hipError_t ihipGetDeviceCount(int *count)
*count = g_deviceCnt;
if (*count > 0) {
e = ihipLogStatus(hipSuccess);
e = hipSuccess;
} else {
e = ihipLogStatus(hipErrorNoDevice);
e = hipErrorNoDevice;
}
} else {
e = ihipLogStatus(hipErrorInvalidValue);
e = hipErrorInvalidValue;
}
return e;
}
@@ -73,7 +73,7 @@ hipError_t ihipGetDeviceCount(int *count)
hipError_t hipGetDeviceCount(int *count)
{
HIP_INIT_API(count);
return ihipGetDeviceCount(count);
return ihipLogStatus(ihipGetDeviceCount(count));
}
hipError_t hipDeviceSetCacheConfig(hipFuncCache_t cacheConfig)
@@ -205,7 +205,7 @@ hipError_t ihipDeviceGetAttribute(int* pi, hipDeviceAttribute_t attr, int device
hipError_t e = hipSuccess;
if(pi == nullptr) {
return ihipLogStatus(hipErrorInvalidValue);
return hipErrorInvalidValue;
}
auto * hipDevice = ihipGetDevice(device);
+104 -90
ファイルの表示
@@ -31,12 +31,9 @@ THE SOFTWARE.
ihipEvent_t::ihipEvent_t(unsigned flags)
: _criticalData(this)
{
_state = hipEventStatusCreated;
_stream = NULL;
_flags = flags;
_timestamp = 0;
_type = hipEventTypeIndependent;
};
@@ -45,56 +42,45 @@ ihipEvent_t::ihipEvent_t(unsigned flags)
void ihipEvent_t::attachToCompletionFuture(const hc::completion_future *cf,
hipStream_t stream, ihipEventType_t eventType)
{
_state = hipEventStatusRecording;
_marker = *cf;
_type = eventType;
_stream = stream;
LockedAccessor_EventCrit_t crit(_criticalData);
crit->_eventData.marker(*cf);
crit->_eventData._type = eventType;
crit->_eventData._stream = stream;
crit->_eventData._state = hipEventStatusRecording;
}
void ihipEvent_t::refereshEventStatus()
std::pair<hipEventStatus_t, uint64_t>
ihipEvent_t::refreshEventStatus()
{
bool isReady0 = locked_isReady();
bool isReady1;
int val = 0;
if (_state == hipEventStatusRecording) {
// TODO - use completion-future functions to obtain ticks and timestamps:
hsa_signal_t *sig = static_cast<hsa_signal_t*> (_marker.get_native_handle());
isReady1 = locked_isReady();
if (sig) {
val = hsa_signal_load_acquire(*sig);
if (val == 0) {
auto ecd = locked_copyCrit();
if (ecd._state == hipEventStatusRecording) {
bool isReady1 = ecd._stream->locked_eventIsReady(this);
if (isReady1) {
LockedAccessor_EventCrit_t eCrit(_criticalData);
if ((_type == hipEventTypeIndependent) || (_type == hipEventTypeStopCommand)) {
_timestamp = _marker.get_end_tick();
} else if (_type == hipEventTypeStartCommand) {
_timestamp = _marker.get_begin_tick();
} else {
assert(0); // TODO - move to debug assert
_timestamp = 0;
}
_state = hipEventStatusComplete;
if ((eCrit->_eventData._type == hipEventTypeIndependent) ||
(eCrit->_eventData._type == hipEventTypeStopCommand)) {
eCrit->_eventData._timestamp = eCrit->_eventData.marker().get_end_tick();
} else if (eCrit->_eventData._type == hipEventTypeStartCommand) {
eCrit->_eventData._timestamp = eCrit->_eventData.marker().get_begin_tick();
} else {
eCrit->_eventData._timestamp = 0;
assert(0); // TODO - move to debug assert
}
eCrit->_eventData._state = hipEventStatusComplete;
return std::pair<hipEventStatus_t, uint64_t> (eCrit->_eventData._state, eCrit->_eventData._timestamp);
}
}
}
if (_state != hipEventStatusComplete) {
//printf (" not ready isReady0=%d val=%d isReady1=%d\n", isReady0, val, isReady1);
}
// Not complete path here:
return std::pair<hipEventStatus_t, uint64_t> (ecd._state, ecd._timestamp);
}
bool ihipEvent_t::locked_isReady()
{
return _stream->locked_eventIsReady(this);
}
void ihipEvent_t::locked_waitComplete(hc::hcWaitMode waitMode)
{
return _stream->locked_eventWaitComplete(this, waitMode);
}
hipError_t ihipEventCreate(hipEvent_t* event, unsigned flags)
@@ -136,33 +122,43 @@ hipError_t hipEventCreate(hipEvent_t* event)
return ihipLogStatus(ihipEventCreate(event, 0));
}
hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
{
HIP_INIT_SPECIAL_API(TRACE_SYNC, event, stream);
if (event && event->_state != hipEventStatusUnitialized) {
auto ecd = event->locked_copyCrit();
if (event && ecd._state != hipEventStatusUnitialized) {
stream = ihipSyncAndResolveStream(stream);
event->_stream = stream;
if (HIP_SYNC_NULL_STREAM && stream->isDefaultStream()) {
// TODO-HIP_SYNC_NULL_STREAM : can remove this code when HIP_SYNC_NULL_STREAM = 0
//
// If default stream , then wait on all queues.
ihipCtx_t *ctx = ihipGetTlsDefaultCtx();
ctx->locked_syncDefaultStream(true, true);
event->_timestamp = hc::get_system_ticks();
event->_state = hipEventStatusComplete;
{
LockedAccessor_EventCrit_t eCrit(event->criticalData());
eCrit->_eventData.marker(hc::completion_future()); // reset event
eCrit->_eventData._stream = stream;
eCrit->_eventData._timestamp = hc::get_system_ticks();
eCrit->_eventData._state = hipEventStatusComplete;
}
return ihipLogStatus(hipSuccess);
} else {
event->_state = hipEventStatusRecording;
// Clear timestamps
event->_timestamp = 0;
// Record the event in the stream:
stream->locked_recordEvent(event);
// Keep a copy outside the critical section so we lock stream first, then event - to avoid deadlock
hc::completion_future cf = stream->locked_recordEvent(event);
{
LockedAccessor_EventCrit_t eCrit(event->criticalData());
eCrit->_eventData.marker(cf);
eCrit->_eventData._stream = stream;
eCrit->_eventData._timestamp = 0;
eCrit->_eventData._state = hipEventStatusRecording;
}
return ihipLogStatus(hipSuccess);
}
@@ -171,15 +167,13 @@ hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream)
}
}
hipError_t hipEventDestroy(hipEvent_t event)
{
HIP_INIT_API(event);
if (event) {
event->_state = hipEventStatusUnitialized;
delete event;
event = NULL;
return ihipLogStatus(hipSuccess);
} else {
@@ -191,19 +185,27 @@ hipError_t hipEventSynchronize(hipEvent_t event)
{
HIP_INIT_SPECIAL_API(TRACE_SYNC, event);
if (!(event->_flags & hipEventReleaseToSystem)) {
tprintf(DB_WARN, "hipEventSynchronize on event without system-scope fence ; consider creating with hipEventReleaseToSystem\n");
}
auto ecd = event->locked_copyCrit();
if (event) {
if (event->_state == hipEventStatusUnitialized) {
if (ecd._state == hipEventStatusUnitialized) {
return ihipLogStatus(hipErrorInvalidResourceHandle);
} else if (event->_state == hipEventStatusCreated ) {
} else if (ecd._state == hipEventStatusCreated ) {
// Created but not actually recorded on any device:
return ihipLogStatus(hipSuccess);
} else if (HIP_SYNC_NULL_STREAM && (event->_stream->isDefaultStream() )) {
} else if (HIP_SYNC_NULL_STREAM && (ecd._stream->isDefaultStream() )) {
auto *ctx = ihipGetTlsDefaultCtx();
// TODO-HIP_SYNC_NULL_STREAM - can remove this code
ctx->locked_syncDefaultStream(true, true);
return ihipLogStatus(hipSuccess);
} else {
event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);
ecd._stream->locked_eventWaitComplete(
ecd.marker(),
(event->_flags & hipEventBlockingSync) ?
hc::hcWaitModeBlocked : hc::hcWaitModeActive);
return ihipLogStatus(hipSuccess);
}
@@ -220,44 +222,50 @@ hipError_t hipEventElapsedTime(float *ms, hipEvent_t start, hipEvent_t stop)
*ms = 0.0f;
if ((start == nullptr) ||
(start->_flags & hipEventDisableTiming) ||
(start->_state == hipEventStatusUnitialized) || (start->_state == hipEventStatusCreated) ||
(stop == nullptr) ||
(stop->_flags & hipEventDisableTiming) ||
( stop->_state == hipEventStatusUnitialized) || ( stop->_state == hipEventStatusCreated)) {
// Both events must be at least recorded else return hipErrorInvalidResourceHandle
if ((start == nullptr) || (stop == nullptr)) {
status = hipErrorInvalidResourceHandle;
} else {
// Refresh status, if still recording...
start->refereshEventStatus();
stop->refereshEventStatus();
auto startEcd = start->locked_copyCrit();
auto stopEcd = stop->locked_copyCrit();
if ((start->_state == hipEventStatusComplete) && (stop->_state == hipEventStatusComplete)) {
// Common case, we have good information for both events.
if ((start->_flags & hipEventDisableTiming) ||
(startEcd._state == hipEventStatusUnitialized) || (startEcd._state == hipEventStatusCreated) ||
(stop->_flags & hipEventDisableTiming) ||
(stopEcd._state == hipEventStatusUnitialized) || (stopEcd._state == hipEventStatusCreated)) {
int64_t tickDiff = (stop->timestamp() - start->timestamp());
// Both events must be at least recorded else return hipErrorInvalidResourceHandle
status = hipErrorInvalidResourceHandle;
uint64_t freqHz;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
if (freqHz) {
*ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f;
status = hipSuccess;
} else {
* ms = 0.0f;
status = hipErrorInvalidValue;
}
// Refresh status, if still recording...
auto startStatus = start->refreshEventStatus(); // pair < state, timestamp >
auto stopStatus = stop->refreshEventStatus(); // pair < state, timestamp >
if ((startStatus.first == hipEventStatusComplete) && (stopStatus.first == hipEventStatusComplete)) {
// Common case, we have good information for both events. 'second" is the timestamp:
int64_t tickDiff = (stopStatus.second - startStatus.second);
uint64_t freqHz;
hsa_system_get_info(HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, &freqHz);
if (freqHz) {
*ms = ((double)(tickDiff) / (double)(freqHz)) * 1000.0f;
status = hipSuccess;
} else {
* ms = 0.0f;
status = hipErrorInvalidValue;
}
} else if ((start->_state == hipEventStatusRecording) ||
(stop->_state == hipEventStatusRecording)) {
} else if ((startStatus.first == hipEventStatusRecording) ||
(stopStatus.first == hipEventStatusRecording)) {
status = hipErrorNotReady;
} else {
status = hipErrorNotReady;
} else {
assert(0);
}
}
}
@@ -268,7 +276,13 @@ hipError_t hipEventQuery(hipEvent_t event)
{
HIP_INIT_SPECIAL_API(TRACE_QUERY, event);
if ((event->_state == hipEventStatusRecording) && !event->locked_isReady()) {
if (!(event->_flags & hipEventReleaseToSystem)) {
tprintf(DB_WARN, "hipEventQuery on event without system-scope fence ; consider creating with hipEventReleaseToSystem\n");
}
auto ecd = event->locked_copyCrit();
if ((ecd._state == hipEventStatusRecording) && !ecd._stream->locked_eventIsReady(event)) {
return ihipLogStatus(hipErrorNotReady);
} else {
return ihipLogStatus(hipSuccess);
+77 -49
ファイルの表示
@@ -47,6 +47,9 @@ THE SOFTWARE.
#include "trace_helper.h"
#include "env.h"
//TODO - create a stream-based debug interface as an additional option for tprintf
#define DB_PEER_CTX 0
//=================================================================================================
//Global variables:
@@ -156,7 +159,7 @@ thread_local TidInfo tls_tidInfo;
//=================================================================================================
// Top-level "free" functions:
//=================================================================================================
void recordApiTrace(std::string *fullStr, const std::string &apiStr)
uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr)
{
auto apiSeqNum = tls_tidInfo.apiSeqNum();
auto tid = tls_tidInfo.tid();
@@ -178,10 +181,14 @@ void recordApiTrace(std::string *fullStr, const std::string &apiStr)
*fullStr += " ";
*fullStr += apiStr;
uint64_t apiStartTick = getTicks();
if (COMPILE_HIP_DB && HIP_TRACE_API) {
fprintf (stderr, "%s<<hip-api tid:%s%s\n" , API_COLOR, fullStr->c_str(), API_COLOR_END);
fprintf (stderr, "%s<<hip-api tid:%s @%lu%s\n" , API_COLOR, fullStr->c_str(), apiStartTick, API_COLOR_END);
}
return apiStartTick;
}
@@ -332,12 +339,11 @@ void ihipStream_t::locked_wait()
// Causes current stream to wait for specified event to complete:
// Note this does not provide any kind of host serialization.
void ihipStream_t::locked_streamWaitEvent(hipEvent_t event)
void ihipStream_t::locked_streamWaitEvent(ihipEventData_t &ecd)
{
LockedAccessor_StreamCrit_t crit(_criticalData);
crit->_av.create_blocking_marker(event->marker(), hc::accelerator_scope);
crit->_av.create_blocking_marker(ecd.marker(), hc::accelerator_scope);
}
@@ -345,24 +351,28 @@ void ihipStream_t::locked_streamWaitEvent(hipEvent_t event)
// Note this does not provide any kind of host serialization.
bool ihipStream_t::locked_eventIsReady(hipEvent_t event)
{
// Event query that returns "Complete" may cause HCC to manipulate
// internal queue state so lock the stream's queue here.
LockedAccessor_StreamCrit_t crit(_criticalData);
LockedAccessor_StreamCrit_t scrit(_criticalData);
return (event->marker().is_ready());
LockedAccessor_EventCrit_t ecrit(event->criticalData());
return (ecrit->_eventData.marker().is_ready());
}
void ihipStream_t::locked_eventWaitComplete(hipEvent_t event, hc::hcWaitMode waitMode)
// Waiting on event can cause HCC to reclaim stream resources - so need to lock the stream.
void ihipStream_t::locked_eventWaitComplete(hc::completion_future &marker, hc::hcWaitMode waitMode)
{
LockedAccessor_StreamCrit_t crit(_criticalData);
event->marker().wait(waitMode);
marker.wait(waitMode);
}
// Create a marker in this stream.
// Save state in the event so it can track the status of the event.
void ihipStream_t::locked_recordEvent(hipEvent_t event)
hc::completion_future ihipStream_t::locked_recordEvent(hipEvent_t event)
{
// Lock the stream to prevent simultaneous access
LockedAccessor_StreamCrit_t crit(_criticalData);
@@ -378,7 +388,7 @@ void ihipStream_t::locked_recordEvent(hipEvent_t event)
scopeFlag = HIP_EVENT_SYS_RELEASE ? hc::system_scope : hc::accelerator_scope;
}
event->marker(crit->_av.create_marker(scopeFlag));
return crit->_av.create_marker(scopeFlag);
};
//=============================================================================
@@ -459,7 +469,9 @@ void ihipCtxCriticalBase_t<CtxMutex>::recomputePeerAgents()
template<>
bool ihipCtxCriticalBase_t<CtxMutex>::isPeerWatcher(const ihipCtx_t *peer)
{
auto match = std::find(_peers.begin(), _peers.end(), peer);
auto match = std::find_if(_peers.begin(), _peers.end(),
[=] (const ihipCtx_t *d) { return d->getDeviceNum() == peer->getDeviceNum(); });
return (match != std::end(_peers));
}
@@ -616,7 +628,7 @@ void ihipDevice_t::locked_reset()
//FIXME - Calling am_memtracker_reset is really bad since it destroyed all buffers allocated by the HCC runtime as well
//such as the printf buffer. Re-initialze the printf buffer as a workaround for now.
#if (__hcc_workweek__ >= 17423)
#ifdef HC_FEATURE_PRINTF
Kalmar::getContext()->initPrintfBuffer();
#endif
};
@@ -700,26 +712,25 @@ int checkAccess(hsa_agent_t agent, hsa_amd_memory_pool_t pool)
return access;
}
hsa_status_t get_region_info(hsa_region_t region, void* data)
hsa_status_t get_pool_info(hsa_amd_memory_pool_t pool, void* data)
{
hsa_status_t err;
hipDeviceProp_t* p_prop = reinterpret_cast<hipDeviceProp_t*>(data);
uint32_t region_segment;
// Get region segment
err = hsa_region_get_info(region, HSA_REGION_INFO_SEGMENT, &region_segment);
// Get pool segment
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT, &region_segment);
ErrorCheck(err);
switch(region_segment) {
case HSA_REGION_SEGMENT_READONLY:
err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->totalConstMem)); break;
/* case HSA_REGION_SEGMENT_PRIVATE:
cout<<"PRIVATE"<<endl; private segment cannot be queried */
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &(p_prop->totalConstMem)); break;
case HSA_REGION_SEGMENT_GROUP:
err = hsa_region_get_info(region, HSA_REGION_INFO_SIZE, &(p_prop->sharedMemPerBlock)); break;
err = hsa_amd_memory_pool_get_info(pool, HSA_AMD_MEMORY_POOL_INFO_SIZE, &(p_prop->sharedMemPerBlock));
break;
default: break;
}
return HSA_STATUS_SUCCESS;
return err;
}
@@ -750,11 +761,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
hipError_t e = hipSuccess;
hsa_status_t err;
// Set some defaults in case we don't find the appropriate regions:
prop->totalGlobalMem = 0;
prop->totalConstMem = 0;
prop-> maxThreadsPerMultiProcessor = 0;
prop->regsPerBlock = 0;
memset(prop, 0, sizeof(hipDeviceProp_t));
if (_hsaAgent.handle == -1) {
return hipErrorInvalidDevice;
@@ -854,15 +861,18 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
prop-> maxThreadsPerMultiProcessor = prop->warpSize*max_waves_per_cu;
// Get memory properties
err = hsa_agent_iterate_regions(_hsaAgent, get_region_info, prop);
err = hsa_amd_agent_iterate_memory_pools(_hsaAgent, get_pool_info, prop);
if (err == HSA_STATUS_INFO_BREAK) {
err = HSA_STATUS_SUCCESS;
}
DeviceErrorCheck(err);
// Get the size of the region we are using for Accelerator Memory allocations:
// Get the size of the pool we are using for Accelerator Memory allocations:
hsa_region_t *am_region = static_cast<hsa_region_t*>(_acc.get_hsa_am_region());
err = hsa_region_get_info(*am_region, HSA_REGION_INFO_SIZE, &prop->totalGlobalMem);
DeviceErrorCheck(err);
// maxSharedMemoryPerMultiProcessor should be as the same as group memory size.
// Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group region size.
// Group memory will not be paged out, so, the physical memory size is the total shared memory size, and also equal to the group pool size.
prop->maxSharedMemoryPerMultiProcessor = prop->totalGlobalMem;
// Get Max memory clock frequency
@@ -882,7 +892,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
prop->arch.hasGlobalFloatAtomicExch = 1;
prop->arch.hasSharedInt32Atomics = 1;
prop->arch.hasSharedFloatAtomicExch = 1;
prop->arch.hasFloatAtomicAdd = 0;
prop->arch.hasFloatAtomicAdd = 1; // supported with CAS loop, but is supported
prop->arch.hasGlobalInt64Atomics = 1;
prop->arch.hasSharedInt64Atomics = 1;
prop->arch.hasDoubles = 1;
@@ -890,7 +900,7 @@ hipError_t ihipDevice_t::initProperties(hipDeviceProp_t* prop)
prop->arch.hasWarpBallot = 1;
prop->arch.hasWarpShuffle = 1;
prop->arch.hasFunnelShift = 0; // TODO-hcc
prop->arch.hasThreadFenceSystem = 0; // TODO-hcc
prop->arch.hasThreadFenceSystem = 1;
prop->arch.hasSyncThreadsExt = 0; // TODO-hcc
prop->arch.hasSurfaceFuncs = 0; // TODO-hcc
prop->arch.has3dGrid = 1;
@@ -1582,7 +1592,9 @@ void ihipPostLaunchKernel(const char *kernelName, hipStream_t stream, grid_launc
tprintf(DB_SYNC, "ihipPostLaunchKernel, unlocking stream\n");
stream->lockclose_postKernelCommand(kernelName, lp.av);
MARKER_END();
if(HIP_PROFILE_API) {
MARKER_END();
}
}
//=================================================================================================
@@ -1677,6 +1689,9 @@ const char *ihipErrorString(hipError_t hip_error)
// So we check dstCtx's and srcCtx's peerList to see if the both include thisCtx.
bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPointerInfo *dstPtrInfo, const hc::AmPointerInfo *srcPtrInfo)
{
if (copyEngineCtx == nullptr) {
return false;
}
// Make sure this is a device-to-device copy with all memory available to the requested copy engine
//
@@ -1684,11 +1699,18 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin
if (dstPtrInfo->_sizeBytes == 0) {
return false;
} else {
#if USE_APP_PTR_FOR_CTX
ihipCtx_t *dstCtx = static_cast<ihipCtx_t*> (dstPtrInfo->_appPtr);
#else
ihipCtx_t *dstCtx = ihipGetPrimaryCtx(dstPtrInfo->_appId);
#endif
if (copyEngineCtx != dstCtx) {
// Only checks peer list if contexts are different
LockedAccessor_CtxCrit_t ctxCrit(dstCtx->criticalData());
//tprintf(DB_SYNC, "dstCrit lock succeeded\n");
#if DB_PEER_CTX
std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " dstCtx =" << dstCtx << " peerCnt="
<< ctxCrit->peerCnt() << "\n";
#endif
if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
return false;
};
@@ -1696,16 +1718,22 @@ bool ihipStream_t::canSeeMemory(const ihipCtx_t *copyEngineCtx, const hc::AmPoin
}
// TODO - pointer-info stores a deviceID not a context,may have some unusual side-effects here:
if (srcPtrInfo->_sizeBytes == 0) {
return false;
} else {
#if USE_APP_PTR_FOR_CTX
ihipCtx_t *srcCtx = static_cast<ihipCtx_t*> (srcPtrInfo->_appPtr);
#else
ihipCtx_t *srcCtx = ihipGetPrimaryCtx(srcPtrInfo->_appId);
#endif
if (copyEngineCtx != srcCtx) {
// Only checks peer list if contexts are different
LockedAccessor_CtxCrit_t ctxCrit(srcCtx->criticalData());
//tprintf(DB_SYNC, "srcCrit lock succeeded\n");
#if DB_PEER_CTX
std::cerr << "checking peer : copyEngineCtx =" << copyEngineCtx << " srcCtx =" << srcCtx << " peerCnt="
<< ctxCrit->peerCnt() << "\n";
#endif
if (!ctxCrit->isPeerWatcher(copyEngineCtx)) {
return false;
};
@@ -1805,7 +1833,7 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
}
} else {
*forceUnpinnedCopy = true;
tprintf (DB_COPY, "P2P: Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n",
tprintf (DB_COPY, "Copy engine(dev:%d agent=0x%lx) cannot see both host and device pointers - forcing copy with unpinned engine.\n",
*copyDevice ? (*copyDevice)->getDeviceNum() : -1,
*copyDevice ? (*copyDevice)->getDevice()->_hsaAgent.handle : 0x0);
if (HIP_FAIL_SOC & 0x2) {
@@ -1820,10 +1848,11 @@ void ihipStream_t::resolveHcMemcpyDirection(unsigned hipMemKind,
void printPointerInfo(unsigned dbFlag, const char *tag, const void *ptr, const hc::AmPointerInfo &ptrInfo)
{
tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d\n",
tprintf (dbFlag, " %s=%p baseHost=%p baseDev=%p sz=%zu home_dev=%d tracked=%d isDevMem=%d registered=%d allocSeqNum=%zu, appAllocationFlags=%x, appPtr=%p\n",
tag, ptr,
ptrInfo._hostPointer, ptrInfo._devicePointer, ptrInfo._sizeBytes,
ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged);
ptrInfo._appId, ptrInfo._sizeBytes != 0, ptrInfo._isInDeviceMem, !ptrInfo._isAmManaged,
ptrInfo._allocSeqNum, ptrInfo._appAllocationFlags, ptrInfo._appPtr);
}
@@ -1871,12 +1900,14 @@ void tailorPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeByte
};
bool getTailoredPtrInfo(hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes)
bool getTailoredPtrInfo(const char *tag, hc::AmPointerInfo *ptrInfo, const void * ptr, size_t sizeBytes)
{
bool tracked = (hc::am_memtracker_getinfo(ptrInfo, ptr) == AM_SUCCESS);
printPointerInfo(DB_COPY, tag, ptr, *ptrInfo);
if (tracked) {
tailorPtrInfo(ptrInfo, ptr, sizeBytes);
printPointerInfo(DB_COPY, " mod", ptr, *ptrInfo);
}
return tracked;
@@ -1906,8 +1937,8 @@ void ihipStream_t::locked_copySync(void* dst, const void* src, size_t sizeBytes,
hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes);
bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes);
bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes);
bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes);
// Some code in HCC and in printPointerInfo uses _sizeBytes==0 as an indication ptr is not valid, so check it here:
@@ -2034,21 +2065,18 @@ void ihipStream_t::locked_copyAsync(void* dst, const void* src, size_t sizeBytes
hc::AmPointerInfo dstPtrInfo(NULL, NULL, 0, acc, 0, 0);
hc::AmPointerInfo srcPtrInfo(NULL, NULL, 0, acc, 0, 0);
#endif
bool dstTracked = getTailoredPtrInfo(&dstPtrInfo, dst, sizeBytes);
bool srcTracked = getTailoredPtrInfo(&srcPtrInfo, src, sizeBytes);
tprintf (DB_COPY, "copyASync dst=%p src=%p, sz=%zu\n", dst, src, sizeBytes);
bool dstTracked = getTailoredPtrInfo(" dst", &dstPtrInfo, dst, sizeBytes);
bool srcTracked = getTailoredPtrInfo(" src", &srcPtrInfo, src, sizeBytes);
hc::hcCommandKind hcCopyDir;
ihipCtx_t *copyDevice;
bool forceUnpinnedCopy;
resolveHcMemcpyDirection(kind, &dstPtrInfo, &srcPtrInfo, &hcCopyDir, &copyDevice, &forceUnpinnedCopy);
tprintf (DB_COPY, "copyASync copyDev:%d dst=%p (phys_dev:%d, isDevMem:%d) src=%p(phys_dev:%d, isDevMem:%d) sz=%zu dir=%s forceUnpinnedCopy=%d\n",
tprintf (DB_COPY, " copyDev:%d dir=%s forceUnpinnedCopy=%d\n",
copyDevice ? copyDevice->getDeviceNum():-1,
dst, dstPtrInfo._appId, dstPtrInfo._isInDeviceMem,
src, srcPtrInfo._appId, srcPtrInfo._isInDeviceMem,
sizeBytes, hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
printPointerInfo(DB_COPY, " dst", dst, dstPtrInfo);
printPointerInfo(DB_COPY, " src", src, srcPtrInfo);
hcMemcpyStr(hcCopyDir), forceUnpinnedCopy);
// "tracked" really indicates if the pointer's virtual address is available in the GPU address space.
// If both pointers are not tracked, we need to fall back to a sync copy.
+98 -27
ファイルの表示
@@ -32,10 +32,19 @@ THE SOFTWARE.
#include "env.h"
#if defined(__HCC__) && (__hcc_workweek__ < 16354)
#if (__hcc_workweek__ < 16354)
#error("This version of HIP requires a newer version of HCC.");
#endif
// Use the __appPtr field in the am memtracker to store the context.
// Requires a bug fix in HCC
#if defined(__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE) and (__HCC_HAS_EXTENDED_AM_MEMTRACKER_UPDATE != 0)
#define USE_APP_PTR_FOR_CTX 1
#endif
#define USE_IPC 1
//---
@@ -128,6 +137,7 @@ extern std::vector<ProfTrigger> g_dbStopTriggers;
class ihipStream_t;
class ihipDevice_t;
class ihipCtx_t;
struct ihipEventData_t;
// Color defs for debug messages:
#define KNRM "\x1B[0m"
@@ -143,10 +153,12 @@ extern const char *API_COLOR;
extern const char *API_COLOR_END;
// If set, thread-safety is enforced on all stream functions.
// Stream functions will acquire a mutex before entering critical sections.
#define STREAM_THREAD_SAFE 1
// If set, thread-safety is enforced on all event/stream/ctx/device functions.
// Can disable for performance or functional experiments - in this case
// the code uses a dummy "no-op" mutex.
#define EVENT_THREAD_SAFE 1
#define STREAM_THREAD_SAFE 1
#define CTX_THREAD_SAFE 1
@@ -209,7 +221,8 @@ extern const char *API_COLOR_END;
#define DB_SYNC 1 /* 0x02 - trace synchronization pieces */
#define DB_MEM 2 /* 0x04 - trace memory allocation / deallocation */
#define DB_COPY 3 /* 0x08 - trace memory copy and peer commands. . */
#define DB_MAX_FLAG 4
#define DB_WARN 4 /* 0x10 - warn about sub-optimal or shady behavior */
#define DB_MAX_FLAG 5
// When adding a new debug flag, also add to the char name table below.
//
//
@@ -226,6 +239,7 @@ static const DbName dbName [] =
{KYEL, "sync"},
{KCYN, "mem"},
{KMAG, "copy"},
{KRED, "warn"},
};
@@ -244,23 +258,28 @@ static const DbName dbName [] =
#endif
static inline uint64_t getTicks()
{
return hc::get_system_ticks();
}
//---
extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
extern uint64_t recordApiTrace(std::string *fullStr, const std::string &apiStr);
#if COMPILE_HIP_ATP_MARKER || (COMPILE_HIP_TRACE_API & 0x1)
#define API_TRACE(forceTrace, ...)\
uint64_t hipApiStartTick;\
{\
tls_tidInfo.incApiSeqNum();\
if (forceTrace || (HIP_PROFILE_API || (COMPILE_HIP_DB && (HIP_TRACE_API & (1<<TRACE_ALL))))) {\
std::string apiStr = std::string(__func__) + " (" + ToString(__VA_ARGS__) + ')';\
std::string fullStr;\
recordApiTrace(&fullStr, apiStr);\
hipApiStartTick = recordApiTrace(&fullStr, apiStr);\
if (HIP_PROFILE_API == 0x1) {MARKER_BEGIN(__func__, "HIP") }\
else if (HIP_PROFILE_API == 0x2) {MARKER_BEGIN(fullStr.c_str(), "HIP"); }\
}\
}
#else
// Swallow API_TRACE
#define API_TRACE(IS_CMD, ...)\
@@ -302,7 +321,10 @@ extern void recordApiTrace(std::string *fullStr, const std::string &apiStr);
tls_lastHipError = localHipStatus;\
\
if ((COMPILE_HIP_TRACE_API & 0x2) && HIP_TRACE_API & (1<<TRACE_ALL)) {\
fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>>%s\n", (localHipStatus == 0) ? API_COLOR:KRED, tls_tidInfo.tid(),tls_tidInfo.apiSeqNum(), __func__, localHipStatus, ihipErrorString(localHipStatus), API_COLOR_END);\
auto ticks = getTicks() - hipApiStartTick;\
fprintf(stderr, " %ship-api tid:%d.%lu %-30s ret=%2d (%s)>> +%lu ns%s\n", \
(localHipStatus == 0) ? API_COLOR:KRED, tls_tidInfo.tid(),tls_tidInfo.apiSeqNum(), \
__func__, localHipStatus, ihipErrorString(localHipStatus), ticks, API_COLOR_END);\
}\
if (HIP_PROFILE_API) { MARKER_END(); }\
localHipStatus;\
@@ -371,6 +393,12 @@ class FakeMutex
void unlock() { }
};
#if EVENT_THREAD_SAFE
typedef std::mutex EventMutex;
#else
#warning "Stream thread-safe disabled"
typedef FakeMutex EventMutex;
#endif
#if STREAM_THREAD_SAFE
typedef std::mutex StreamMutex;
@@ -521,11 +549,11 @@ public:
hc::accelerator_view* locked_getAv() { LockedAccessor_StreamCrit_t crit(_criticalData); return &(crit->_av); };
void locked_streamWaitEvent(hipEvent_t event);
void locked_recordEvent(hipEvent_t event);
void locked_streamWaitEvent(ihipEventData_t & event);
hc::completion_future locked_recordEvent(hipEvent_t event);
bool locked_eventIsReady(hipEvent_t event);
void locked_eventWaitComplete(hipEvent_t event, hc::hcWaitMode waitMode);
void locked_eventWaitComplete(hc::completion_future &marker, hc::hcWaitMode waitMode);
ihipStreamCritical_t &criticalData() { return _criticalData; };
@@ -609,32 +637,76 @@ enum ihipEventType_t {
hipEventTypeStopCommand,
};
struct ihipEventData_t
{
ihipEventData_t() {
_state = hipEventStatusCreated;
_stream = NULL;
_timestamp = 0;
_type = hipEventTypeIndependent;
};
void marker(const hc::completion_future & marker) { _marker = marker; };
hc::completion_future & marker() { return _marker; }
uint64_t timestamp() const { return _timestamp; } ;
ihipEventType_t type() const { return _type; };
ihipEventType_t _type;
hipEventStatus_t _state;
hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual stream when recorded
uint64_t _timestamp; // store timestamp, may be set on host or by marker.
private:
hc::completion_future _marker;
};
//=============================================================================
//class ihipEventCriticalBase_t
template <typename MUTEX_TYPE>
class ihipEventCriticalBase_t : LockedBase<MUTEX_TYPE>
{
public:
ihipEventCriticalBase_t(const ihipEvent_t *parentEvent) :
_parent(parentEvent)
{}
~ihipEventCriticalBase_t() {};
// Keep data in structure so it can be easily copied into snapshots
// (used to reduce lock contention and preserve correct lock order)
ihipEventData_t _eventData;
private:
const ihipEvent_t *_parent;
friend class LockedAccessor<ihipEventCriticalBase_t>;
};
typedef ihipEventCriticalBase_t<EventMutex> ihipEventCritical_t;
typedef LockedAccessor<ihipEventCritical_t> LockedAccessor_EventCrit_t;
// internal hip event structure.
class ihipEvent_t {
public:
ihipEvent_t(unsigned flags);
void attachToCompletionFuture(const hc::completion_future *cf, hipStream_t stream, ihipEventType_t eventType);
void refereshEventStatus();
hc::completion_future & marker() { return _marker; }
void marker(hc::completion_future cf) { _marker = cf; };
std::pair<hipEventStatus_t, uint64_t> refreshEventStatus(); // returns pair <state, timestamp>
bool locked_isReady();
void locked_waitComplete(hc::hcWaitMode waitMode);
uint64_t timestamp() const { return _timestamp; } ;
ihipEventType_t type() const { return _type; };
// Return a copy of the critical state. The critical data is locked during the copy.
ihipEventData_t locked_copyCrit() {
LockedAccessor_EventCrit_t crit(_criticalData);
return _criticalData._eventData;
};
ihipEventCritical_t &criticalData() { return _criticalData; };
public:
hipEventStatus_t _state;
hipStream_t _stream; // Stream where the event is recorded. Null stream is resolved to actual stream when recorded
unsigned _flags;
private:
hc::completion_future _marker;
ihipEventType_t _type;
uint64_t _timestamp; // store timestamp, may be set on host or by marker.
ihipEventCritical_t _criticalData;
friend hipError_t hipEventRecord(hipEvent_t event, hipStream_t stream);
} ;
@@ -652,7 +724,6 @@ public:
};
~ihipDeviceCriticalBase_t() {
}
// Contexts:
+17 -9
ファイルの表示
@@ -61,11 +61,15 @@ int sharePtr(void *ptr, ihipCtx_t *ctx, bool shareWithAll, unsigned hipFlags)
auto device = ctx->getWriteableDevice();
#if USE_APP_PTR_FOR_CTX
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags, ctx);
#else
hc::am_memtracker_update(ptr, device->_deviceId, hipFlags);
#endif
if (shareWithAll) {
hsa_status_t s = hsa_amd_agents_allow_access(g_deviceCnt+1, g_allAgents, NULL, ptr);
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
tprintf (DB_MEM, " allow access to CPU + all %d GPUs (shareWithAll)\n", g_deviceCnt);
if (s != HSA_STATUS_SUCCESS) {
ret = -1;
}
@@ -122,7 +126,7 @@ void * allocAndSharePtr(const char *msg, size_t sizeBytes, ihipCtx_t *ctx, bool
if (HIP_INIT_ALLOC != -1) {
// TODO , dont' call HIP API directly here:
hipMemset(ptr, HIP_INIT_ALLOC, sizeBytes);
}
}
if (ptr != nullptr) {
int r = sharePtr(ptr, ctx, shareWithAll, hipFlags);
@@ -251,7 +255,7 @@ hipError_t hipMalloc(void** ptr, size_t sizeBytes)
hip_status = hipErrorMemoryAllocation;
}
}
}
return ihipLogStatus(hip_status);
@@ -284,10 +288,10 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
}
const unsigned supportedFlags = hipHostMallocPortable
| hipHostMallocMapped
| hipHostMallocWriteCombined
| hipHostMallocCoherent
const unsigned supportedFlags = hipHostMallocPortable
| hipHostMallocMapped
| hipHostMallocWriteCombined
| hipHostMallocCoherent
| hipHostMallocNonCoherent;
@@ -300,7 +304,7 @@ hipError_t hipHostMalloc(void** ptr, size_t sizeBytes, unsigned int flags)
hip_status = hipErrorInvalidValue;
} else {
auto device = ctx->getWriteableDevice();
unsigned amFlags = 0;
if (flags & hipHostMallocCoherent) {
amFlags = amHostCoherent;
@@ -581,7 +585,7 @@ hipError_t hipMalloc3DArray(hipArray_t *array,
hsa_ext_image_data_info_t imageInfo;
hsa_status_t status = hsa_ext_image_data_get_info(*agent, &imageDescriptor, permission, &imageInfo);
size_t alignment = imageInfo.alignment <= allocGranularity ? 0 : imageInfo.alignment;
*ptr = hip_internal::allocAndSharePtr("device_array", allocSize, ctx, false, am_flags, 0, alignment);
if (size && (*ptr == NULL)) {
@@ -660,7 +664,11 @@ hipError_t hipHostRegister(void *hostPtr, size_t sizeBytes, unsigned int flags)
vecAcc.push_back(ihipGetDevice(i)->_acc);
}
am_status = hc::am_memory_host_lock(device->_acc, hostPtr, sizeBytes, &vecAcc[0], vecAcc.size());
#if USE_APP_PTR_FOR_CTX
hc::am_memtracker_update(hostPtr, device->_deviceId, flags, ctx);
#else
hc::am_memtracker_update(hostPtr, device->_deviceId, flags);
#endif
tprintf(DB_MEM, " %s registered ptr=%p and allowed access to %zu peers\n", __func__, hostPtr, vecAcc.size());
if(am_status == AM_SUCCESS){
+158 -10
ファイルの表示
@@ -27,6 +27,7 @@ THE SOFTWARE.
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>
#include <vector>
#include <map>
@@ -217,7 +218,7 @@ namespace
using namespace ELFIO;
using namespace std;
static constexpr pair<Elf64_Addr, Elf_Xword> r{0, 0};
static const pair<Elf64_Addr, Elf_Xword> r{0, 0};
for (auto i = 0u; i != section.get_symbols_num(); ++i) {
// TODO: this is boyscout code, caching the temporaries
@@ -374,6 +375,8 @@ hipError_t hipModuleLoad(hipModule_t *module, const char *fname)
hipError_t hipModuleUnload(hipModule_t hmod)
{
HIP_INIT_API(hmod);
// TODO - improve this synchronization so it is thread-safe.
// Currently we want for all inflight activity to complete, but don't prevent another
// thread from launching new kernels before we finish this operation.
@@ -408,7 +411,7 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char
hipError_t ret = hipSuccess;
if (name == nullptr){
return ihipLogStatus(hipErrorInvalidValue);
return (hipErrorInvalidValue);
}
if (ctx == nullptr){
@@ -431,7 +434,7 @@ hipError_t ihipModuleGetSymbol(hipFunction_t *func, hipModule_t hmod, const char
hsa_executable_symbol_t symbol;
status = hsa_executable_get_symbol(hmod->executable, NULL, name, gpuAgent, 0, &symbol);
if(status != HSA_STATUS_SUCCESS){
return ihipLogStatus(hipErrorNotFound);
return hipErrorNotFound;
}
status = hsa_executable_symbol_get_info(symbol,
@@ -503,11 +506,11 @@ hipError_t ihipModuleLaunchKernel(hipFunction_t f,
if(config[0] == HIP_LAUNCH_PARAM_BUFFER_POINTER && config[2] == HIP_LAUNCH_PARAM_BUFFER_SIZE && config[4] == HIP_LAUNCH_PARAM_END){
kernArgSize = *(size_t*)(config[3]);
} else {
return ihipLogStatus(hipErrorNotInitialized);
return hipErrorNotInitialized;
}
}else{
return ihipLogStatus(hipErrorInvalidValue);
return hipErrorInvalidValue;
}
@@ -611,6 +614,125 @@ hipError_t hipHccModuleLaunchKernel(hipFunction_t f,
sharedMemBytes, hStream, kernelParams, extra, startEvent, stopEvent));
}
namespace
{
struct Agent_global {
std::string name;
hipDeviceptr_t address;
std::uint32_t byte_cnt;
};
inline
void* address(hsa_executable_symbol_t x)
{
void* r = nullptr;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_ADDRESS, &r);
return r;
}
inline
std::string name(hsa_executable_symbol_t x)
{
uint32_t sz = 0u;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME_LENGTH, &sz);
std::string r(sz, '\0');
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_NAME, &r.front());
return r;
}
inline
std::uint32_t size(hsa_executable_symbol_t x)
{
std::uint32_t r = 0;
hsa_executable_symbol_get_info(
x, HSA_EXECUTABLE_SYMBOL_INFO_VARIABLE_SIZE, &r);
return r;
}
inline
void track(const Agent_global& x)
{
tprintf(
DB_MEM,
" add variable '%s' with ptr=%p size=%u to tracker\n",
x.name.c_str(),
x.address,
x.byte_cnt);
auto device = ihipGetTlsDefaultCtx()->getWriteableDevice();
hc::AmPointerInfo ptr_info(
nullptr,
x.address,
x.address,
x.byte_cnt,
device->_acc,
true,
false);
hc::am_memtracker_add(x.address, ptr_info);
hc::am_memtracker_update(x.address, device->_deviceId, 0u);
}
template<typename Container = std::vector<Agent_global>>
inline
hsa_status_t copy_agent_global_variables(
hsa_executable_t, hsa_agent_t, hsa_executable_symbol_t x, void* out)
{
assert(out);
hsa_symbol_kind_t t = {};
hsa_executable_symbol_get_info(x, HSA_EXECUTABLE_SYMBOL_INFO_TYPE, &t);
if (t == HSA_SYMBOL_KIND_VARIABLE) {
static_cast<Container*>(out)->push_back(
Agent_global{name(x), address(x), size(x)});
track(static_cast<Container*>(out)->back());
}
return HSA_STATUS_SUCCESS;
}
inline
hsa_agent_t this_agent()
{
auto ctx = ihipGetTlsDefaultCtx();
if (!ctx) throw std::runtime_error{"No active HIP context."};
auto device = ctx->getDevice();
if (!device) throw std::runtime_error{"No device available for HIP."};
ihipDevice_t *currentDevice = ihipGetDevice(device->_deviceId);
if (!currentDevice) {
throw std::runtime_error{"No active device for HIP"};
}
return currentDevice->_hsaAgent;
}
inline
std::vector<Agent_global> read_agent_globals(hipModule_t hmodule)
{
std::vector<Agent_global> r;
hsa_executable_iterate_agent_symbols(
hmodule->executable, this_agent(), copy_agent_global_variables, &r);
return r;
}
}
hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
hipModule_t hmod, const char* name)
{
@@ -623,11 +745,37 @@ hipError_t hipModuleGetGlobal(hipDeviceptr_t *dptr, size_t *bytes,
return ihipLogStatus(hipErrorNotInitialized);
}
else{
hipFunction_t func;
ihipModuleGetSymbol(&func, hmod, name);
*bytes = PrintSymbolSizes(hmod->ptr, name) + sizeof(amd_kernel_code_t);
*dptr = reinterpret_cast<void*>(func->_object);
return ihipLogStatus(ret);
static std::unordered_map<
hipModule_t, std::vector<Agent_global>> agent_globals;
// TODO: this is not particularly robust.
if (agent_globals.count(hmod) == 0) {
static std::mutex mtx;
std::lock_guard<std::mutex> lck{mtx};
if (agent_globals.count(hmod) == 0) {
agent_globals.emplace(hmod, read_agent_globals(hmod));
}
}
// TODO: This is unsafe iff some other emplacement triggers rehashing.
// It will have to be properly fleshed out in the future.
const auto it0 = agent_globals.find(hmod);
if (it0 == agent_globals.cend()) {
throw std::runtime_error{"agent_globals data structure corrupted."};
}
const auto it1 = std::find_if(
it0->second.cbegin(),
it0->second.cend(),
[=](const Agent_global& x) { return x.name == name; });
if (it1 == it0->second.cend()) return ihipLogStatus(hipErrorNotFound);
*dptr = it1->address;
*bytes = it1->byte_cnt;
return ihipLogStatus(hipSuccess);
}
}
+9 -5
ファイルの表示
@@ -93,18 +93,23 @@ hipError_t hipStreamWaitEvent(hipStream_t stream, hipEvent_t event, unsigned int
hipError_t e = hipSuccess;
auto ecd = event->locked_copyCrit();
if (event == nullptr) {
e = hipErrorInvalidResourceHandle;
} else if (event->_state != hipEventStatusUnitialized) {
} else if ((ecd._state != hipEventStatusUnitialized) &&
(ecd._state != hipEventStatusCreated)) {
if (HIP_SYNC_STREAM_WAIT || (HIP_SYNC_NULL_STREAM && (stream == 0))) {
// conservative wait on host for the specified event to complete:
event->locked_waitComplete((event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);
// return _stream->locked_eventWaitComplete(this, waitMode);
//
ecd._stream->locked_eventWaitComplete(ecd.marker(), (event->_flags & hipEventBlockingSync) ? hc::hcWaitModeBlocked : hc::hcWaitModeActive);
} else {
stream = ihipSyncAndResolveStream(stream);
// This will user create_blocking_marker to wait on the specified queue.
stream->locked_streamWaitEvent(event);
// This will use create_blocking_marker to wait on the specified queue.
stream->locked_streamWaitEvent(ecd);
}
} // else event not recorded, return immediately and don't create marker.
@@ -140,7 +145,6 @@ hipError_t hipStreamQuery(hipStream_t stream)
//---
hipError_t hipStreamSynchronize(hipStream_t stream)
{
HIP_INIT_API(stream);
HIP_INIT_SPECIAL_API(TRACE_SYNC, stream);
hipError_t e = hipSuccess;
+12
ファイルの表示
@@ -75,6 +75,18 @@ RUN: %t CMAKE_TEST_NAME <arguments_to_test_executable> EXCLUDE_HIP_PLATFORM <hcc
```
#### CMAKECMD command
The supported syntax for the CMAKECMD command is:
```
CMAKECMD: <cmake_command> <options_to_cmake_command>
```
cmake_command: refers to any of the commands supported by ```cmake -E``` as specified in the [cmake documentation](https://cmake.org/cmake/help/latest/manual/cmake.1.html#command-line-tool-mode). Note that the commands are limited by the version of cmake the user is running.
options_to_cmake_command: refers to the arguments supported by the specific cmake_command. The arguments are parsed by HIT to replace special markers. The markers supported by HIT are:
%S: Refers to the source directory containing the current source file.
%B: Refers to the build directory for the current cmake project i.e. CMAKE_CURRENT_BINARY_DIR.
### Running tests:
```
ctest
+18
ファイルの表示
@@ -0,0 +1,18 @@
// RUN: %run_test hipify "%s" "%t" %cuda_args
#pragma once
#include <cuda_runtime.h>
/**
* Allocate GPU memory for `count` elements of type `T`.
*/
template<typename T>
static T* gpuMalloc(size_t count) {
T* ret = nullptr;
// CHECK: hipMalloc(&ret, count * sizeof(T));
cudaMalloc(&ret, count * sizeof(T));
return ret;
}
+7
ファイルの表示
@@ -2,6 +2,8 @@
#include <iostream>
// CHECK: #include <hip/hip_runtime.h>
#include <cuda.h>
#define TOKEN_PASTE(X, Y) X ## Y
#define ARG_LIST_AS_MACRO a, device_x, device_y
@@ -33,8 +35,13 @@ int main(int argc, char* argv[]) {
// CHECK: hipMalloc(&device_x, kDataLen * sizeof(float));
cudaMalloc(&device_x, kDataLen * sizeof(float));
#ifdef HERRING
// CHECK: hipMalloc(&device_y, kDataLen * sizeof(float));
cudaMalloc(&device_y, kDataLen * sizeof(float));
#else
// CHECK: hipMalloc(&device_y, kDataLen * sizeof(double));
cudaMalloc(&device_y, kDataLen * sizeof(double));
#endif
// CHECK: hipMemcpy(device_x, host_x, kDataLen * sizeof(float), hipMemcpyHostToDevice);
cudaMemcpy(device_x, host_x, kDataLen * sizeof(float), cudaMemcpyHostToDevice);
+2
ファイルの表示
@@ -99,6 +99,7 @@ int main(int argc, char **argv)
// use command-line specified CUDA device, otherwise use device with highest Gflops/s
cuda_device = findCudaDevice(argc, (const char **)argv);
// CHECK: hipDeviceProp_t deviceProp;
cudaDeviceProp deviceProp;
// CHECK: checkCudaErrors(hipGetDevice(&cuda_device));
checkCudaErrors(cudaGetDevice(&cuda_device));
@@ -135,6 +136,7 @@ int main(int argc, char **argv)
checkCudaErrors(cudaStreamCreate(&(streams[i])));
}
// CHECK: hipEvent_t start_event, stop_event;
// create CUDA event handles
cudaEvent_t start_event, stop_event;
+14
ファイルの表示
@@ -155,6 +155,20 @@ macro(HIT_ADD_FILES _dir _label _parent)
endif()
endforeach()
# Run cmake commands
execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --cmakeCMDs ${file}
OUTPUT_VARIABLE _contents
ERROR_QUIET
WORKING_DIRECTORY ${_dir}
OUTPUT_STRIP_TRAILING_WHITESPACE)
string(REGEX REPLACE "\n" ";" _contents "${_contents}")
string(REGEX REPLACE "%S" ${_dir} _contents "${_contents}")
string(REGEX REPLACE "%B" ${CMAKE_CURRENT_BINARY_DIR} _contents "${_contents}")
foreach(_cmd ${_contents})
string(REGEX REPLACE " " ";" _cmd "${_cmd}")
execute_process(COMMAND ${CMAKE_COMMAND} -E ${_cmd})
endforeach()
# Add tests
execute_process(COMMAND ${HIP_SRC_PATH}/tests/hit/parser --runCMDs ${file}
OUTPUT_VARIABLE _contents
+21 -6
ファイルの表示
@@ -8,7 +8,7 @@ use File::Spec;
sub parse_file {
my $file = shift;
(my $exe = $file) =~ s/\.[^.]+$//g;
my (@buildCMDs, @runCMDs, @runNamedCMDs);
my (@buildCMDs, @runCMDs, @runNamedCMDs, @cmakeCMDs);
if (open (SOURCE, '<:encoding(UTF-8)', "$file")) {
while (<SOURCE>) {
my $line=$_;
@@ -36,10 +36,17 @@ sub parse_file {
$line =~ s/\R//g; # Remove line endings
push @runNamedCMDs, $line;
}
# Look for CMAKECMD instructions
if ($line =~ /^ \* CMAKECMD:/) {
$line =~ s/^ \* CMAKECMD: //g; # Remove " * CMAKECMD: "
# Substitute %S -> srcdir and %B -> builddir happens in cmake
$line =~ s/\R//g; # Remove line endings
push @cmakeCMDs, $line;
}
}
close(SOURCE);
}
return (\@buildCMDs, \@runCMDs, \@runNamedCMDs);
return (\@buildCMDs, \@runCMDs, \@runNamedCMDs, \@cmakeCMDs);
}
# Exit if no arguments specified
@@ -53,8 +60,9 @@ my @options = ();
my $retBuildCMDs = 0;
my $retRunCMDs = 0;
my $retRunNamedCMDs = 0;
my $retCmakeCMDs = 0;
foreach $arg (@ARGV) {
if ($retBuildCMDs or $retRunCMDs or $retRunNamedCMDs) {
if ($retBuildCMDs or $retRunCMDs or $retRunNamedCMDs or $retCmakeCMDs) {
push (@options, $arg);
}
if ($arg eq '--buildCMDs') {
@@ -66,18 +74,21 @@ foreach $arg (@ARGV) {
if ($arg eq '--runNamedCMDs') {
$retRunNamedCMDs = 1;
}
if ($arg eq '--cmakeCMDs') {
$retCmakeCMDs = 1;
}
}
# Atleast one command needs to be specified
if (($retBuildCMDs eq 0) and ($retRunCMDs eq 0) and ($retRunNamedCMDs eq 0)) {
die "Usage: $0 <--buildCMDs|--runCMDs|--runNamedCMDs> FILENAMEs\n";
if (($retBuildCMDs eq 0) and ($retRunCMDs eq 0) and ($retRunNamedCMDs eq 0) and ($retCmakeCMDs eq 0)) {
die "Usage: $0 <--buildCMDs|--runCMDs|--runNamedCMDs|--cmakeCMDs> FILENAMEs\n";
}
# Iterate over input files
foreach $file (@options) {
# Convert absolute path to path relative to working directory
my $relfile = File::Spec->abs2rel($file);
my ($buildCMDs, $runCMDs, $runNamedCMDs) = parse_file("$relfile");
my ($buildCMDs, $runCMDs, $runNamedCMDs, $cmakeCMDs) = parse_file("$relfile");
if ($retBuildCMDs) {
# print "BuildCMDs:\n";
print "$_\n" for @$buildCMDs;
@@ -90,6 +101,10 @@ foreach $file (@options) {
# print "RunNamedCMDs:\n";
print "$_\n" for @$runNamedCMDs;
}
if ($retCmakeCMDs) {
# print "CmakeCMDs:\n";
print "$_\n" for @$cmakeCMDs;
}
}
# vim: ts=4:sw=4:expandtab:smartindent
+1 -1
ファイルの表示
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
+1 -2
ファイルの表示
@@ -46,7 +46,6 @@ int main(int argc, char *argv[])
A_h = new char[Nbytes];
HIPCHECK ( hipMalloc((void **) &A_d, Nbytes) );
A_h = (char*)malloc(Nbytes);
printf ("Size=%zu memsetval=%2x \n", Nbytes, memsetval);
HIPCHECK ( hipMemsetD8(A_d, memsetval, Nbytes) );
@@ -61,7 +60,7 @@ int main(int argc, char *argv[])
}
hipFree((void *) A_d);
free(A_h);
delete [] A_h;
passed();
}
+1 -1
ファイルの表示
@@ -27,7 +27,7 @@ THE SOFTWARE.
#define SIZE 64<<2
__global__ void getSqAbs(hipLaunchParm lp, float *A, float *B, float *C){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
C[tx] = hipCsqabsf(make_hipFloatComplex(A[tx], B[tx]));
}
+3 -3
ファイルの表示
@@ -16,13 +16,13 @@
__global__ void cpy(hipLaunchParm lp, uint32_t *Out, uint32_t *In)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
memcpy(Out + tx, In + tx, sizeof(uint32_t));
}
__global__ void set(hipLaunchParm lp, uint32_t *ptr, uint8_t val, size_t size)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
memset(ptr + tx, val, (sizeof(uint32_t)*(size/LEN)));
}
@@ -58,6 +58,6 @@ int main()
return 0;
}
}
passed();
}
+1 -1
ファイルの表示
@@ -35,7 +35,7 @@ THE SOFTWARE.
__global__ void floatMath(hipLaunchParm lp, float *In, float *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Out[tid] = __cosf(In[tid]);
Out[tid] = __exp10f(Out[tid]);
Out[tid] = __expf(Out[tid]);
+2 -2
ファイルの表示
@@ -217,7 +217,7 @@ int computeGold(int *gpuData, const int len)
__global__ void testKernel(hipLaunchParm lp,int *g_odata)
{
// access thread id
const unsigned int tid = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
const unsigned int tid = blockDim.x * blockIdx.x + threadIdx.x;
// Test various atomic instructions
@@ -240,7 +240,7 @@ __global__ void testKernel(hipLaunchParm lp,int *g_odata)
// Atomic increment (modulo 17+1)
atomicInc((unsigned int *)&g_odata[5], 17);
// Atomic decrement
atomicDec((unsigned int *)&g_odata[6], 137);
+140 -33
ファイルの表示
@@ -32,82 +32,82 @@ THE SOFTWARE.
#define SIZE N*sizeof(float)
__global__ void test_sincosf(hipLaunchParm lp, float* a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincosf(a[tid], b+tid, c+tid);
}
__global__ void test_sincospif(hipLaunchParm lp, float* a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincospif(a[tid], b+tid, c+tid);
}
__global__ void test_fdividef(hipLaunchParm lp, float *a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = fdividef(a[tid], b[tid]);
}
__global__ void test_llrintf(hipLaunchParm lp, float *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llrintf(a[tid]);
}
__global__ void test_lrintf(hipLaunchParm lp, float *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lrintf(a[tid]);
}
__global__ void test_rintf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rintf(a[tid]);
}
__global__ void test_llroundf(hipLaunchParm lp, float *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llroundf(a[tid]);
}
__global__ void test_lroundf(hipLaunchParm lp, float *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lroundf(a[tid]);
}
__global__ void test_rhypotf(hipLaunchParm lp, float *a, float* b, float *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = rhypotf(a[tid], b[tid]);
}
__global__ void test_norm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = norm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = norm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_normf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = normf(N, a);
}
__global__ void test_rnorm3df(hipLaunchParm lp, float *a, float* b, float *c, float *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = rnorm3df(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4df(hipLaunchParm lp, float *a, float* b, float *c, float *d, float *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = rnorm4df(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnormf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rnormf(N, a);
}
__global__ void test_erfinvf(hipLaunchParm lp, float *a, float *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = erff(erfinvf(a[tid]));
}
@@ -139,7 +139,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -174,7 +181,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -205,7 +219,13 @@ for(int i=0;i<512;i++){
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -234,7 +254,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -263,7 +288,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -291,7 +321,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -321,7 +356,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -350,7 +390,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -387,7 +432,16 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
if(passed == 1){
return true;
}
@@ -427,7 +481,18 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
delete [] E;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
hipFree(Ed);
if(passed == 1){
return true;
}
@@ -457,7 +522,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -489,7 +559,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -525,7 +602,16 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
if(passed == 1){
return true;
}
@@ -565,7 +651,18 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
delete [] E;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
hipFree(Ed);
if(passed == 1){
return true;
}
@@ -595,7 +692,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -622,7 +724,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -631,7 +738,7 @@ return false;
}
int main(){
if(run_sincosf() && run_sincospif() && run_fdividef() &&
if(run_sincosf() && run_sincospif() && run_fdividef() &&
run_llrintf() && run_norm3df() && run_norm4df() &&
run_normf() && run_rnorm3df() && run_rnorm4df() &&
run_rnormf() && run_lroundf() && run_llroundf() &&
+124 -28
ファイルの表示
@@ -32,72 +32,72 @@ THE SOFTWARE.
#define SIZE N*sizeof(double)
__global__ void test_sincos(hipLaunchParm lp, double* a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincos(a[tid], b+tid, c+tid);
}
__global__ void test_sincospi(hipLaunchParm lp, double* a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
sincospi(a[tid], b+tid, c+tid);
}
__global__ void test_llrint(hipLaunchParm lp, double *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llrint(a[tid]);
}
__global__ void test_lrint(hipLaunchParm lp, double *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lrint(a[tid]);
}
__global__ void test_rint(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rint(a[tid]);
}
__global__ void test_llround(hipLaunchParm lp, double *a, long long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = llround(a[tid]);
}
__global__ void test_lround(hipLaunchParm lp, double *a, long int *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = lround(a[tid]);
}
__global__ void test_rhypot(hipLaunchParm lp, double *a, double* b, double *c){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
c[tid] = rhypot(a[tid], b[tid]);
}
__global__ void test_norm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = norm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_norm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = norm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm3d(hipLaunchParm lp, double *a, double* b, double *c, double *d){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
d[tid] = rnorm3d(a[tid], b[tid], c[tid]);
}
__global__ void test_rnorm4d(hipLaunchParm lp, double *a, double* b, double *c, double *d, double *e){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
e[tid] = rnorm4d(a[tid], b[tid], c[tid], d[tid]);
}
__global__ void test_rnorm(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = rnorm(N, a);
}
__global__ void test_erfinv(hipLaunchParm lp, double *a, double *b){
int tid = hipThreadIdx_x;
int tid = threadIdx.x;
b[tid] = erf(erfinv(a[tid]));
}
@@ -128,7 +128,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -163,7 +170,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -193,7 +207,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -221,7 +240,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -249,7 +273,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -278,7 +307,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -306,7 +340,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -343,7 +382,16 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
if(passed == 1){
return true;
}
@@ -383,7 +431,18 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
delete [] E;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
hipFree(Ed);
if(passed == 1){
return true;
}
@@ -416,7 +475,14 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
if(passed == 1){
return true;
}
@@ -452,7 +518,16 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
if(passed == 1){
return true;
}
@@ -492,7 +567,18 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
delete [] C;
delete [] D;
delete [] E;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
hipFree(Dd);
hipFree(Ed);
if(passed == 1){
return true;
}
@@ -522,7 +608,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
@@ -549,7 +640,12 @@ for(int i=0;i<512;i++){
passed = 1;
}
}
free(A);
delete [] A;
delete [] B;
hipFree(Ad);
hipFree(Bd);
if(passed == 1){
return true;
}
+1 -1
ファイルの表示
@@ -36,7 +36,7 @@ __device__ int globalOut[NUM];
__global__ void Assign(hipLaunchParm lp, int* Out)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Out[tid] = globalIn[tid];
globalOut[tid] = globalIn[tid];
}
+22 -15
ファイルの表示
@@ -19,16 +19,17 @@ THE SOFTWARE.
#include <iostream>
#include <hip/hip_fp16.h>
#include "hip/hip_runtime_api.h"
#include "hip/hip_runtime.h"
#include "test_common.h"
#define LEN 64
#define HALF_SIZE 64*sizeof(__half)
#define HALF2_SIZE 64*sizeof(__half2)
#if __HIP_ARCH_GFX803__ > 0
#if __HIP_ARCH_GFX803__ || __HIP_ARCH_GFX900__
__global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
__half a = A[tx];
__half b = B[tx];
__half c = C[tx];
@@ -44,7 +45,7 @@ __global__ void __halfMath(hipLaunchParm lp, __half *A, __half *B, __half *C) {
}
__global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C) {
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
__half2 a = A[tx];
__half2 b = B[tx];
__half2 c = C[tx];
@@ -61,15 +62,21 @@ __global__ void __half2Math(hipLaunchParm lp, __half2 *A, __half2 *B, __half2 *C
#endif
int main(){
__half *A, *B, *C;
hipMalloc(&A, HALF_SIZE);
hipMalloc(&B, HALF_SIZE);
hipMalloc(&C, HALF_SIZE);
hipLaunchKernel(__halfMath, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A, B, C);
__half2 *A2, *B2, *C2;
hipMalloc(&A, HALF2_SIZE);
hipMalloc(&B, HALF2_SIZE);
hipMalloc(&C, HALF2_SIZE);
hipLaunchKernel(__half2Math, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A2, B2, C2);
__half *A, *B, *C;
hipMalloc(&A, HALF_SIZE);
hipMalloc(&B, HALF_SIZE);
hipMalloc(&C, HALF_SIZE);
hipLaunchKernel(__halfMath, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A, B, C);
hipFree(A);
hipFree(B);
hipFree(C);
__half2 *A2, *B2, *C2;
hipMalloc(&A2, HALF2_SIZE);
hipMalloc(&B2, HALF2_SIZE);
hipMalloc(&C2, HALF2_SIZE);
hipLaunchKernel(__half2Math, dim3(1,1,1), dim3(LEN,1,1), 0, 0, A2, B2, C2);
hipFree(A2);
hipFree(B2);
hipFree(C2);
passed();
}
+1 -1
ファイルの表示
@@ -33,7 +33,7 @@ THE SOFTWARE.
__global__ void vAdd(hipLaunchParm lp, float *In1, float *In2, float *In3, float *In4, float *Out)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
In4[tid] = In1[tid] + In2[tid];
__threadfence();
In3[tid] = In3[tid] + In4[tid];
+4 -4
ファイルの表示
@@ -37,9 +37,9 @@ __global__ void
warpvote(hipLaunchParm lp, int* device_any, int* device_all , int Num_Warps_per_Block, int pshift)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
device_any[hipThreadIdx_x>>pshift] = __any(tid -77);
device_all[hipThreadIdx_x>>pshift] = __all(tid -77);
int tid = threadIdx.x + blockIdx.x * blockDim.x;
device_any[threadIdx.x>>pshift] = __any(tid -77);
device_all[threadIdx.x>>pshift] = __all(tid -77);
}
int main(int argc, char *argv[])
@@ -49,7 +49,7 @@ int main(int argc, char *argv[])
warpSize = devProp.warpSize;
int w = warpSize;
pshift = 0;
pshift = 0;
while (w >>= 1) ++pshift;
printf ("warpSize=%d pshift=%d\n", warpSize, pshift);
+4 -4
ファイルの表示
@@ -34,12 +34,12 @@ __global__ void
gpu_ballot(hipLaunchParm lp, unsigned int* device_ballot, int Num_Warps_per_Block,int pshift)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
const unsigned int warp_num = hipThreadIdx_x >> pshift;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
const unsigned int warp_num = threadIdx.x >> pshift;
#ifdef __HIP_PLATFORM_HCC__
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popcll(__ballot(tid - 245)));
#else
atomicAdd(&device_ballot[warp_num+hipBlockIdx_x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
atomicAdd(&device_ballot[warp_num+blockIdx.x*Num_Warps_per_Block],__popc(__ballot(tid - 245)));
#endif
}
+2 -2
ファイルの表示
@@ -72,8 +72,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned long long int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
ファイルの表示
@@ -83,8 +83,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
ファイルの表示
@@ -73,8 +73,8 @@ HIP_kernel(hipLaunchParm lp,
int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
ファイルの表示
@@ -64,8 +64,8 @@ HIP_kernel(hipLaunchParm lp,
unsigned int* a, unsigned int* b,unsigned int* c, unsigned long long int* d, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+2 -2
ファイルの表示
@@ -57,8 +57,8 @@ vectoradd_float(hipLaunchParm lp,
T* a, const T* bm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
+35 -35
ファイルの表示
@@ -40,13 +40,13 @@ THE SOFTWARE.
#define THREADS_PER_BLOCK_Z 1
__global__ void
__global__ void
vectoradd_char1(hipLaunchParm lp,
char1* a, const char1* bm, const char1* cm, int width, int height)
char1* a, const char1* bm, const char1* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -54,40 +54,40 @@ vectoradd_char1(hipLaunchParm lp,
}
}
__global__ void
__global__ void
vectoradd_char2(hipLaunchParm lp,
char2* a, const char2* bm, const char2* cm, int width, int height)
char2* a, const char2* bm, const char2* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = make_char2(bm[i].x, bm[i].y) + make_char2(cm[i].x, cm[i].y);
}
}
}
__global__ void
__global__ void
vectoradd_char3(hipLaunchParm lp,
char3* a, const char3* bm, const char3* cm, int width, int height)
char3* a, const char3* bm, const char3* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
a[i] = make_char3(bm[i].x, bm[i].y, bm[i].z) + make_char3(cm[i].x, cm[i].y, cm[i].z);
}
}
__global__ void
__global__ void
vectoradd_char4(hipLaunchParm lp,
char4* a, const char4* bm, const char4* cm, int width, int height)
char4* a, const char4* bm, const char4* cm, int width, int height)
{
int x = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int y = hipBlockDim_y * hipBlockIdx_y + hipThreadIdx_y;
int x = blockDim.x * blockIdx.x + threadIdx.x;
int y = blockDim.y * blockIdx.y + threadIdx.y;
int i = y * width + x;
if ( i < (width * height)) {
@@ -99,7 +99,7 @@ vectoradd_char4(hipLaunchParm lp,
#if 0
__kernel__ void vectoradd_float(float* a, const float* b, const float* c, int width, int height) {
int x = blockDimX * blockIdx.x + threadIdx.x;
int y = blockDimY * blockIdy.y + threadIdx.y;
@@ -128,21 +128,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -192,21 +192,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -256,21 +256,21 @@ bool dataTypesRun(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -319,21 +319,21 @@ bool dataTypesRunChar4(){
hostA = (T*)malloc(NUM * sizeof(T));
hostB = (T*)malloc(NUM * sizeof(T));
hostC = (T*)malloc(NUM * sizeof(T));
// initialize the input data
for (i = 0; i < NUM; i++) {
hostB[i] = (T)i;
hostC[i] = (T)i;
}
HIP_ASSERT(hipMalloc((void**)&deviceA, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceB, NUM * sizeof(T)));
HIP_ASSERT(hipMalloc((void**)&deviceC, NUM * sizeof(T)));
HIP_ASSERT(hipMemcpy(deviceB, hostB, NUM*sizeof(T), hipMemcpyHostToDevice));
HIP_ASSERT(hipMemcpy(deviceC, hostC, NUM*sizeof(T), hipMemcpyHostToDevice));
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
hipLaunchKernel(HIP_KERNEL_NAME(vectoradd_char1),
dim3(WIDTH/THREADS_PER_BLOCK_X, HEIGHT/THREADS_PER_BLOCK_Y),
dim3(THREADS_PER_BLOCK_X, THREADS_PER_BLOCK_Y),
0, 0,
@@ -368,7 +368,7 @@ bool dataTypesRunChar4(){
}
int main() {
hipDeviceProp_t devProp;
hipGetDeviceProperties(&devProp, 0);
cout << " System minor " << devProp.minor << endl;
+1 -1
ファイルの表示
@@ -36,7 +36,7 @@ THE SOFTWARE.
#define SIZE LEN<<2
__global__ void kernel_trig(hipLaunchParm lp, float *In, float *sin_d, float *cos_d, float *tan_d, float *sin_pd, float *cos_pd){
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
sin_d[tid] = __sinf(In[tid]);
cos_d[tid] = __cosf(In[tid]);
tan_d[tid] = __tanf(In[tid]);
+1 -1
ファイルの表示
@@ -29,7 +29,7 @@ THE SOFTWARE.
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd, size_t len)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+9 -3
ファイルの表示
@@ -30,23 +30,29 @@ THE SOFTWARE.
__global__ void Kern(hipLaunchParm lp, float *A)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
A[tx] += 1.0f;
}
int main()
{
float *A, *Ad;
float A[len];
float *Ad;
for(int i=0;i<len;i++)
{
A[i] = 1.0f;
}
Ad = (float*)mallocHip(size);
memcpyHipH2D(Ad, A, size);
hipLaunchKernel(HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, A);
hipLaunchKernel(
HIP_KERNEL_NAME(Kern), dim3(len/1024), dim3(1024), 0, 0, Ad);
memcpyHipD2H(A, Ad, size);
for(int i=0;i<len;i++)
{
assert(A[i] == 2.0f);
}
hipFree(Ad);
}
+1 -1
ファイルの表示
@@ -33,7 +33,7 @@ class memManager;
template<typename T>
__global__ void Add(hipLaunchParm lp, T* Ad, T* Bd, T* Cd, size_t Len)
{
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx < Len)
{
Cd[tx] = Ad[tx] + Bd[tx];
+1 -1
ファイルの表示
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
ファイルの表示
@@ -29,7 +29,7 @@ THE SOFTWARE.
#define SIZE 1024*1024*sizeof(int)
__global__ void Iter(hipLaunchParm lp, int *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
if(tx == 0){
for(int i=0;i<ITER;i++){
Ad[tx] += 1;
+1 -1
ファイルの表示
@@ -2,7 +2,7 @@
#include "hip/hip_runtime_api.h"
__global__ void Kernel(hipLaunchParm lp, float *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] += Ad[tx-1];
}
+2 -2
ファイルの表示
@@ -40,8 +40,8 @@ __global__ void testExternSharedKernel(hipLaunchParm lp, const T* A_d, const T*
T *sdata = reinterpret_cast<T *>(my_sdata);
#endif
size_t gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t tid = hipThreadIdx_x;
size_t gid = (blockIdx.x * blockDim.x + threadIdx.x);
size_t tid = threadIdx.x;
// initialize dynamic shared memory
if (tid < groupElements) {
+1 -1
ファイルの表示
@@ -34,7 +34,7 @@ THE SOFTWARE.
__global__ void vectorAdd(hipLaunchParm lp, float *Ad, float *Bd) {
HIP_DYNAMIC_SHARED(float, sBd);
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
for(int i=0;i<LEN/64;i++) {
sBd[tx + i * 64] = Ad[tx + i * 64] + 1.0f;
Bd[tx + i * 64] = sBd[tx + i * 64];
+4 -3
ファイルの表示
@@ -48,11 +48,12 @@ vectorADD2( hipLaunchParm lp,
T *C_d,
size_t N)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i] ;
double foo = __hiloint2double(A_d[i], B_d[i]);
C_d[i] = __double2loint(foo) + __double2hiint(foo);//A_d[i] + B_d[i] ;
}
}
+6 -6
ファイルの表示
@@ -62,7 +62,7 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
{
//KERNELBEGIN;
unsigned gid = hipThreadIdx_x;
unsigned gid = threadIdx.x;
if (gid < N) {
c[gid] = a[gid] + PlusOne(b[gid]);
}
@@ -74,8 +74,8 @@ __global__ void MyKernel (const hipLaunchParm lp, const float *a, const float *b
void callMyKernel()
{
float *a, *b, *c;
unsigned N;
const unsigned blockSize = 256;
unsigned N = blockSize;
hipLaunchKernel(MyKernel, dim3(N/blockSize), dim3(blockSize), 0, 0, a,b,c,N);
}
@@ -96,13 +96,13 @@ vectorADD(const hipLaunchParm lp,
int zuzu = deviceVar + 1;
int b = hipThreadIdx_x;
int b = threadIdx.x;
int c;
#ifdef NOT_YET
int a = __shfl_up(x, 1);
#endif
float x;
float x = 1.0;
float z = sin(x);
#ifdef NOT_YET
float fastZ = __sin(x);
@@ -119,8 +119,8 @@ vectorADD(const hipLaunchParm lp,
__syncthreads();
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<N; i+=stride) {
C_d[i] = A_d[i] + B_d[i];
+38
ファイルの表示
@@ -0,0 +1,38 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
#define HIP_ENABLE_PRINTF
#include"test_common.h"
__global__ void run_printf(hipLaunchParm lp){
printf("Hello World\n");
}
int main(){
hipLaunchKernel(HIP_KERNEL_NAME(run_printf), dim3(1), dim3(1), 0, 0);
hipDeviceSynchronize();
passed();
}
+2 -2
ファイルの表示
@@ -18,7 +18,7 @@ THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../test_common.cpp
* BUILD: %t %s ../test_common.cpp
* RUN: %t
* HIT_END
*/
@@ -38,7 +38,7 @@ __constant__ int Value[LEN];
__global__ void Get(hipLaunchParm lp, int *Ad)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tid] = Value[tid];
}
+2 -2
ファイルの表示
@@ -33,12 +33,12 @@ THE SOFTWARE.
#define SIZE NUM * 8
__global__ void Alloc(hipLaunchParm lp, uint64_t *Ptr) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
Ptr[tid] = (uint64_t)malloc(128);
}
__global__ void Free(hipLaunchParm lp, uint64_t *Ptr) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
free((void*)Ptr[tid]);
}
+40 -25
ファイルの表示
@@ -35,52 +35,52 @@ THE SOFTWARE.
#define LEN12 12 * 4
__global__ void MemCpy8(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*8, In + tid*8, 8);
}
__global__ void MemCpy9(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*9, In + tid*9, 9);
}
__global__ void MemCpy10(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*10, In + tid*10, 10);
}
__global__ void MemCpy11(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*11, In + tid*11, 11);
}
__global__ void MemCpy12(hipLaunchParm lp, uint8_t *In, uint8_t *Out) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memcpy(Out + tid*12, In + tid*12, 12);
}
__global__ void MemSet8(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*8, 1, 8);
}
__global__ void MemSet9(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*9, 1, 9);
}
__global__ void MemSet10(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*10, 1, 10);
}
__global__ void MemSet11(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*11, 1, 11);
}
__global__ void MemSet12(hipLaunchParm lp, uint8_t *In) {
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
memset(In + tid*12, 1, 12);
}
@@ -107,9 +107,12 @@ int main(){
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
A = new uint8_t[LEN9];
B = new uint8_t[LEN9];
@@ -132,9 +135,12 @@ int main(){
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
A = new uint8_t[LEN10];
B = new uint8_t[LEN10];
@@ -157,9 +163,12 @@ int main(){
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
A = new uint8_t[LEN11];
B = new uint8_t[LEN11];
@@ -182,9 +191,12 @@ int main(){
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
A = new uint8_t[LEN12];
B = new uint8_t[LEN12];
@@ -207,9 +219,12 @@ int main(){
assert(C[i] == 1);
}
delete A;
delete B;
delete C;
delete [] A;
delete [] B;
delete [] C;
hipFree(Ad);
hipFree(Bd);
hipFree(Cd);
passed();
}
+9 -3
ファイルの表示
@@ -37,9 +37,15 @@ __global__ void vadd_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
#ifdef __HIP_PLATFORM_NVCC__
asm volatile("add.f32 %0,%1,%2;":"=f"(out[i]):"f"(in[i]),"f"(out[i]));
#endif
#ifdef __HIP_PLATFORM_HCC__
asm volatile ("v_add_f32_e32 %0, %1, %2" : "=v" (out[i]) : "v"(in[i]),"v" (out[i]));
#endif
}
// CPU implementation of Vector Result
@@ -49,7 +55,7 @@ void addCPUReference(
{
for(unsigned int j=0; j < NUM; j++)
{
output[j]= input[j] + output[j];
}
}
+2 -2
ファイルの表示
@@ -35,7 +35,7 @@ __global__ void vmac_asm(hipLaunchParm lp,
float *out,
float *in)
{
int i = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
int i = blockDim.x * blockIdx.x + threadIdx.x;
asm volatile ("v_mac_f32_e32 %0, %2, %3" : "=v" (out[i]) : "0"(out[i]), "v" (a), "v" (in[i]));
}
@@ -47,7 +47,7 @@ void CPUReference(
{
for(unsigned int j=0; j < NUM; j++)
{
output[j]= a*input[j] + output[j];
}
}
+1 -1
ファイルの表示
@@ -33,7 +33,7 @@ void
__launch_bounds__(256, 2)
myKern(hipLaunchParm lp, int *C, const int *A, int N, int xfactor)
{
int tid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int tid = (blockIdx.x * blockDim.x + threadIdx.x);
if (tid < N) {
C[tid] = A[tid];
+46
ファイルの表示
@@ -0,0 +1,46 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipDeviceComputeCapability(int* major, int* minor, hipDevice_t device);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
int numDevices = 0;
int major,minor;
hipDevice_t device;
HIPCHECK(hipGetDeviceCount(&numDevices));
for(int i=0;i<numDevices;i++){
HIPCHECK(hipDeviceGet(&device,i));
HIPCHECK(hipDeviceComputeCapability(&major, &minor, device));
HIPASSERT(major >= 0);
HIPASSERT(minor >= 0);
}
passed();
}
+47
ファイルの表示
@@ -0,0 +1,47 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
#define len 256
int main()
{
int numDevices = 0;
char name[len];
hipDevice_t device;
HIPCHECK(hipGetDeviceCount(&numDevices));
for(int i=0;i<numDevices;i++){
HIPCHECK(hipDeviceGet(&device,i));
HIPCHECK(hipDeviceGetName(name,len,device));
HIPASSERT(name != "");
}
passed();
}
+74
ファイルの表示
@@ -0,0 +1,74 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Test for checking the functionality of
* hipError_t hipDeviceSynchronize();
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include"test_common.h"
#define _SIZE sizeof(int)*1024*1024
#define NUM_STREAMS 2
__global__ void Iter(hipLaunchParm lp, int *Ad, int num){
int tx = threadIdx.x + blockIdx.x * blockDim.x;
// Kernel loop designed to execute very slowly... ... ... so we can test timing-related behavior below
if(tx == 0){
for(int i = 0; i<num;i++){
Ad[tx] += 1;
}
}
}
int main(){
int *A[NUM_STREAMS];
int *Ad[NUM_STREAMS];
hipStream_t stream[NUM_STREAMS];
for(int i=0;i<NUM_STREAMS;i++){
HIPCHECK(hipHostMalloc((void**)&A[i], _SIZE, hipHostMallocDefault));
A[i][0] = 1;
HIPCHECK(hipMalloc((void**)&Ad[i], _SIZE));
HIPCHECK(hipStreamCreate(&stream[i]));
}
for(int i=0;i<NUM_STREAMS;i++){
HIPCHECK(hipMemcpyAsync(Ad[i], A[i], _SIZE, hipMemcpyHostToDevice, stream[i]));
}
for(int i=0;i<NUM_STREAMS;i++){
hipLaunchKernel(HIP_KERNEL_NAME(Iter), dim3(1), dim3(1), 0, stream[i], Ad[i], 1<<30);
}
for(int i=0;i<NUM_STREAMS;i++){
HIPCHECK(hipMemcpyAsync(A[i], Ad[i], _SIZE, hipMemcpyDeviceToHost, stream[i]));
}
// This first check but relies on the kernel running for so long that the D2H async memcopy has not started yet.
// This will be true in an optimal asynchronous implementation.
// Conservative implementations which synchronize the hipMemcpyAsync will fail, ie if HIP_LAUNCH_BLOCKING=true
HIPASSERT(1<<30 != A[NUM_STREAMS-1][0]-1);
HIPCHECK(hipDeviceSynchronize());
HIPASSERT(1<<30 == A[NUM_STREAMS-1][0]-1);
passed();
}
+45
ファイルの表示
@@ -0,0 +1,45 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipDeviceGetName(char* name, int len, hipDevice_t device);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
int numDevices = 0;
size_t totMem;
hipDevice_t device;
HIPCHECK(hipGetDeviceCount(&numDevices));
for(int i=0;i<numDevices;i++){
HIPCHECK(hipDeviceGet(&device,i));
HIPCHECK(hipDeviceTotalMem(&totMem,device));
HIPASSERT(totMem != 0);
}
passed();
}
+44
ファイルの表示
@@ -0,0 +1,44 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipGetDevice(int *device);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
int numDevices = 0;
int device;
HIPCHECK(hipGetDeviceCount(&numDevices));
for(int i=0;i<numDevices;i++){
HIPCHECK(hipSetDevice(i));
HIPCHECK(hipGetDevice(&device));
HIPASSERT(device == i);
}
passed();
}
+39
ファイルの表示
@@ -0,0 +1,39 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipRuntimeGetVersion(int* runtimeVersion);
* On HIP/HCC path this function returns HIP runtime patch version(a 5 digit code) however on HIP/NVCC path this function return CUDA runtime version.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
int runtimeVersion;
HIPCHECK(hipRuntimeGetVersion(&runtimeVersion));
passed();
}
+36
ファイルの表示
@@ -0,0 +1,36 @@
/*
Copyright (c) 2015-Present Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include<hip/hip_runtime_api.h>
#include<iostream>
#include"test_common.h"
int main(){
hipFuncCache_t cacheConfig;
void *func;
hipFuncSetCacheConfig(func, cacheConfig);
passed();
}
+36
ファイルの表示
@@ -0,0 +1,36 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t EXCLUDE_HIP_PLATFORM
* HIT_END
*/
#include "test_common.h"
int main(){
int numDevices = 0;
HIPCHECK(hipGetDeviceCount(&numDevices));
for(int i=0;i<numDevices;i++){
HIPCHECK(hipSetDevice(i));
}
HIPASSERT(hipErrorInvalidDevice == hipSetDevice(numDevices));
passed();
}
+52
ファイルの表示
@@ -0,0 +1,52 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
unsigned flag = 0;
HIPCHECK(hipDeviceReset());
int deviceCount = 0;
HIPCHECK(hipGetDeviceCount(&deviceCount));
for(int j=0;j<deviceCount;j++){
HIPCHECK(hipSetDevice(j));
for(int i=0;i<4;i++){
flag = 1 << i;
printf ("Flag=%x\n", flag);
HIPCHECK(hipSetDeviceFlags(flag));
//HIPCHECK_API(hipSetDeviceFlags(flag), hipErrorInvalidValue);
}
flag = 0;
}
passed();
}
+38
ファイルの表示
@@ -0,0 +1,38 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipGetDevice(int *device);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
hipSetDevice(-1);
if(hipPeekAtLastError() != hipSuccess)
passed();
}
+10 -10
ファイルの表示
@@ -52,7 +52,7 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
if (!(testMask & p_tests)) {
return;
}
printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n",
printf ("\ntest 0x%3x: stream=%p waitStart=%d syncMode=%s\n",
testMask, stream, waitStart, syncModeString(syncMode));
size_t sizeBytes = numElements * sizeof(int);
@@ -85,8 +85,8 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
HIPCHECK(hipEventSynchronize(start));
}
hipError_t expectedStopError = hipSuccess;
hipError_t expectedStopError = hipSuccess;
// How to wait for the events to finish:
switch (syncMode) {
@@ -97,12 +97,12 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
HIPCHECK(hipStreamSynchronize(stream)); // wait for recording to finish...
break;
case syncStopEvent:
HIPCHECK(hipEventSynchronize(stop));
HIPCHECK(hipEventSynchronize(stop));
break;
default:
assert(0);
};
float t;
@@ -111,25 +111,25 @@ void test(unsigned testMask, int *C_d, int *C_h, int64_t numElements, hipStream_
failed ("start event not in expected state, was %d=%s\n", e, hipGetErrorName(e));
}
if (e == hipSuccess)
if (e == hipSuccess)
assert (t==0.0f);
// stop usually ready unless we skipped the synchronization (syncNone)
HIPCHECK_API(hipEventElapsedTime(&t, stop, stop), expectedStopError);
if (e == hipSuccess)
if (e == hipSuccess)
assert (t==0.0f);
e = hipEventElapsedTime(&t, start, stop);
HIPCHECK_API(e, expectedStopError);
if (expectedStopError == hipSuccess)
if (expectedStopError == hipSuccess)
assert (t>0.0f);
printf ("time=%6.2f error=%s\n", t, hipGetErrorName(e));
e = hipEventElapsedTime(&t, stop, start);
HIPCHECK_API(e, expectedStopError);
if (expectedStopError == hipSuccess)
if (expectedStopError == hipSuccess)
assert (t<0.0f);
printf ("negtime=%6.2f error=%s\n", t, hipGetErrorName(e));
+1 -1
ファイルの表示
@@ -33,7 +33,7 @@ THE SOFTWARE.
#define SIZE LEN*sizeof(float)
__global__ void Add(hipLaunchParm lp, float *Ad, float *Bd, float *Cd){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Cd[tx] = Ad[tx] + Bd[tx];
}
+13 -13
ファイルの表示
@@ -33,13 +33,13 @@
#define SIZE LEN*sizeof(float)
__global__ void Add(float *Ad, float *Bd, float *Cd){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Cd[tx] = Ad[tx] + Bd[tx];
}
__global__ void Set(int *Ad, int val){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] = val;
}
@@ -52,13 +52,13 @@ std::vector<std::string> syncMsg = {"event", "stream", "device"};
void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMethod, std::string msg)
{
std::cerr << "test: CheckHostPointer " << msg
std::cerr << "test: CheckHostPointer " << msg
//<< " HIP_COHERENT_HOST_ALLOC=" << HIP_COHERENT_HOST_ALLOC
//<< " HIP_EVENT_SYS_RELEASE=" << HIP_EVENT_SYS_RELEASE
<< " eventFlags = " << std::hex << eventFlags
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
<< " ptr=" << ptr
<< " eventFlags = " << std::hex << eventFlags
<< ((eventFlags & hipEventReleaseToDevice) ? " hipEventReleaseToDevice" : "")
<< ((eventFlags & hipEventReleaseToSystem) ? " hipEventReleaseToSystem" : "")
<< " ptr=" << ptr
<< " syncMethod=" << syncMsg[syncMethod] << "\n";
hipStream_t s;
@@ -93,7 +93,7 @@ void CheckHostPointer(int numElements, int *ptr, unsigned eventFlags, int syncMe
default:
assert(0);
};
for (int i=0; i<numElements; i++) {
if (ptr[i] != expected) {
printf ("mismatch at %d: %d != %d\n", i, ptr[i], expected);
@@ -153,7 +153,7 @@ int main(){
size_t sizeBytes = numElements * sizeof (int);
#ifdef __HIP_PLATFORM_HCC__
{
{
// Stimulate error condition:
int *A = &numElements;
HIPCHECK_API(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent|hipHostMallocNonCoherent), hipErrorInvalidValue);
@@ -174,7 +174,7 @@ int main(){
// agent-scope releases don't provide host visibility, don't use them here:
}
if (1) {
if (1) {
int *A = nullptr;
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes, hipHostMallocCoherent));
const char *ptrType = "coherent";
@@ -189,14 +189,14 @@ int main(){
// Check defaults:
if (1) {
if (1) {
int *A = nullptr;
HIPCHECK(hipHostMalloc((void**)&A, sizeBytes));
const char *ptrType = "default";
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_DEVICE, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_STREAM, ptrType);
CheckHostPointer(numElements, A, 0, SYNC_EVENT, ptrType);
@@ -206,7 +206,7 @@ int main(){
}
passed();
}
+6 -6
ファイルの表示
@@ -29,13 +29,13 @@ THE SOFTWARE.
#include<malloc.h>
__global__ void Inc(hipLaunchParm lp, float *Ad){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Ad[tx] = Ad[tx] + float(1);
}
template<typename T>
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internalRegister)
{
A = A + offset;
numElements -= offset;
@@ -56,7 +56,7 @@ void doMemCopy(size_t numElements, int offset, T *A, T *Bh, T *Bd, bool internal
HIPCHECK(hipMemset(Bd, 13.0f, sizeBytes));
//
//
HIPCHECK(hipMemcpy(Bd, A, sizeBytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(Bh, Bd, sizeBytes, hipMemcpyDeviceToHost));
@@ -81,7 +81,7 @@ int main(int argc, char *argv[])
const size_t size = N * sizeof(float);
if (p_tests & 0x1) {
if (p_tests & 0x1) {
float *A, **Ad;
int num_devices;
HIPCHECK(hipGetDeviceCount(&num_devices));
@@ -115,7 +115,7 @@ int main(int argc, char *argv[])
}
if (p_tests & 0x6) {
if (p_tests & 0x6) {
// Sensitize HIP bug if device does not match where the memory was registered.
HIPCHECK(hipSetDevice(0));
@@ -129,7 +129,7 @@ int main(int argc, char *argv[])
HIPCHECK(hipMalloc(&Bd, size));
// TODO - set to 128
#define OFFSETS_TO_TRY 128
#define OFFSETS_TO_TRY 128
assert (N>OFFSETS_TO_TRY);
if (p_tests & 0x2) {
+33 -33
ファイルの表示
@@ -58,7 +58,7 @@ public:
void offset(int offset) { _offset = offset; };
int offset() const { return _offset; };
private:
T * _A_d;
T* _B_d;
@@ -72,7 +72,7 @@ private:
template<typename T>
DeviceMemory<T>::DeviceMemory(size_t numElements)
: _maxNumElements(numElements),
: _maxNumElements(numElements),
_offset(0)
{
T ** np = nullptr;
@@ -93,7 +93,7 @@ DeviceMemory<T>::~DeviceMemory ()
HipTest::freeArrays (_A_d, _B_d, _C_d, np, np, np, 0);
HIPCHECK (hipFree(_C_dd));
_C_dd = NULL;
};
@@ -125,7 +125,7 @@ public:
T * A_hh;
T* B_hh;
bool _usePinnedHost;
bool _usePinnedHost;
private:
size_t _maxNumElements;
@@ -165,11 +165,11 @@ HostMemory<T>::HostMemory(size_t numElements, bool usePinnedHost)
template<typename T>
void
HostMemory<T>::reset(size_t numElements, bool full)
HostMemory<T>::reset(size_t numElements, bool full)
{
// Initialize the host data:
for (size_t i=0; i<numElements; i++) {
(A_hh)[i] = 1097.0 + i;
(A_hh)[i] = 1097.0 + i;
(B_hh)[i] = 1492.0 + i; // Phi
if (full) {
@@ -213,8 +213,8 @@ template <typename T>
void memcpytest2(DeviceMemory<T> *dmem, HostMemory<T> *hmem, size_t numElements, bool useHostToHost, bool useDeviceToDevice, bool useMemkindDefault)
{
size_t sizeElements = numElements * sizeof(T);
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n",
__func__,
printf ("test: %s<%s> size=%lu (%6.2fMB) usePinnedHost:%d, useHostToHost:%d, useDeviceToDevice:%d, useMemkindDefault:%d, offsets:dev:%+d host:+%d\n",
__func__,
TYPENAME(T),
sizeElements, sizeElements/1024.0/1024.0,
hmem->_usePinnedHost, useHostToHost, useDeviceToDevice, useMemkindDefault,
@@ -273,8 +273,8 @@ void memcpytest2_for_type(size_t numElements)
{
printSep();
DeviceMemory<T> memD(numElements);
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
DeviceMemory<T> memD(numElements);
HostMemory<T> memU(numElements, 0/*usePinnedHost*/);
HostMemory<T> memP(numElements, 1/*usePinnedHost*/);
for (int usePinnedHost =0; usePinnedHost<=1; usePinnedHost++) {
@@ -307,11 +307,11 @@ void memcpytest2_sizes(size_t maxElem=0)
maxElem = free/sizeof(T)/8;
}
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
HIPCHECK ( hipDeviceReset() );
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
for (size_t elem=1; elem<=maxElem; elem*=2) {
@@ -336,11 +336,11 @@ void memcpytest2_offsets(size_t maxElem, bool devOffsets, bool hostOffsets)
HIPCHECK(hipMemGetInfo(&free, &total));
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
printf (" device#%d: hipMemGetInfo: free=%zu (%4.2fMB) total=%zu (%4.2fMB) maxSize=%6.1fMB\n",
deviceId, free, (float)(free/1024.0/1024.0), total, (float)(total/1024.0/1024.0), maxElem*sizeof(T)/1024.0/1024.0);
HIPCHECK ( hipDeviceReset() );
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
DeviceMemory<T> memD(maxElem);
HostMemory<T> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<T> memP(maxElem, 1/*usePinnedHost*/);
size_t elem = maxElem / 2;
@@ -380,16 +380,16 @@ void multiThread_1(bool serialize, bool usePinnedHost)
{
printSep();
printf ("test: %s<%s> serialize=%d usePinnedHost=%d\n", __func__, TYPENAME(T), serialize, usePinnedHost);
DeviceMemory<T> memD(N);
HostMemory<T> mem1(N, usePinnedHost);
HostMemory<T> mem2(N, usePinnedHost);
DeviceMemory<T> memD(N);
HostMemory<T> mem1(N, usePinnedHost);
HostMemory<T> mem2(N, usePinnedHost);
std::thread t1 (memcpytest2<T>, &memD, &mem1, N, 0,0,0);
if (serialize) {
t1.join();
}
std::thread t2 (memcpytest2<T>,&memD, &mem2, N, 0,0,0);
if (serialize) {
t2.join();
@@ -427,21 +427,21 @@ int main(int argc, char *argv[])
// Some tests around the 64KB boundary which have historically shown issues:
printf ("\n\n=== tests&0x2 (64KB boundary)\n");
size_t maxElem = 32*1024*1024;
DeviceMemory<float> memD(maxElem);
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
DeviceMemory<float> memD(maxElem);
HostMemory<float> memU(maxElem, 0/*usePinnedHost*/);
HostMemory<float> memP(maxElem, 0/*usePinnedHost*/);
// These all pass:
memcpytest2<float>(&memD, &memP, 15*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 15*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+16*1024, 0, 0, 0);
// Just over 64MB:
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 16*1024*1024+512*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 17*1024*1024+1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memU, 32*1024*1024, 0, 0, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
memcpytest2<float>(&memD, &memP, 32*1024*1024, 1, 1, 0);
}
@@ -464,7 +464,7 @@ int main(int argc, char *argv[])
// Simplest cases: serialize the threads, and also used pinned memory:
// This verifies that the sub-calls to memcpytest2 are correct.
multiThread_1<float>(true, true);
multiThread_1<float>(true, true);
// Serialize, but use unpinned memory to stress the unpinned memory xfer path.
multiThread_1<float>(true, false);
+12 -12
ファイルの表示
@@ -59,7 +59,7 @@ struct HostTraits<Pinned>
static const char *Name() { return "Pinned"; } ;
static void *Alloc(size_t sizeBytes) {
void *p;
void *p;
HIPCHECK(hipHostMalloc((void**)&p, sizeBytes, hipHostMallocDefault));
return p;
};
@@ -67,11 +67,11 @@ struct HostTraits<Pinned>
template<typename T>
__global__ void
__global__ void
addK (hipLaunchParm lp, T *A, T K, size_t numElements)
{
size_t offset = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
size_t stride = hipBlockDim_x * hipGridDim_x ;
size_t offset = (blockIdx.x * blockDim.x + threadIdx.x);
size_t stride = blockDim.x * gridDim.x ;
for (size_t i=offset; i<numElements; i+=stride) {
A[i] = A[i] + K;
@@ -85,7 +85,7 @@ addK (hipLaunchParm lp, T *A, T K, size_t numElements)
//IN: numInflight : number of copies inflight at any time:
//IN: numPongs = number of iterations to run (iteration)
template<typename T, class AllocType>
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int numPongs, bool doHostSide)
{
HIPASSERT(numElements % numInflight == 0); // Must be evenly divisible.
size_t Nbytes = numElements*sizeof(T);
@@ -95,7 +95,7 @@ void test_pingpong(hipStream_t stream, size_t numElements, int numInflight, int
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, numElements);
printf ("-----------------------------------------------------------------------------------------------\n");
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
printf ("testing: %s<%s> Nbytes=%zu (%6.1f MB) numPongs=%d numInflight=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
__func__, HostTraits<AllocType>::Name(), Nbytes, (double)(Nbytes)/1024.0/1024.0, numPongs, numInflight, eachCopyElements, eachCopyBytes);
T *A_h = NULL;
@@ -176,7 +176,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
size_t eachCopyBytes = eachCopyElements * sizeof(T);
printf ("-----------------------------------------------------------------------------------------------\n");
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
printf ("testing: %s Nbytes=%zu (%6.1f MB) numCopies=%d eachCopyElements=%zu eachCopyBytes=%zu\n",
__func__, Nbytes, (double)(Nbytes)/1024.0/1024.0, numCopies, eachCopyElements, eachCopyBytes);
T *A_d;
@@ -194,7 +194,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
//stream=0; // fixme TODO
for (int i=0; i<numCopies; i++)
for (int i=0; i<numCopies; i++)
{
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
HIPCHECK(hipMemcpyAsync(&A_d[i*eachCopyElements], &A_h1[i*eachCopyElements], eachCopyBytes, hipMemcpyHostToDevice, stream));
@@ -204,7 +204,7 @@ void test_manyInflightCopies(hipStream_t stream, int numElements, int numCopies,
HIPCHECK(hipDeviceSynchronize());
}
for (int i=0; i<numCopies; i++)
for (int i=0; i<numCopies; i++)
{
HIPASSERT(A_d + i*eachCopyElements < A_d + Nbytes);
HIPCHECK(hipMemcpyAsync(&A_h2[i*eachCopyElements], &A_d[i*eachCopyElements], eachCopyBytes, hipMemcpyDeviceToHost, stream));
@@ -252,7 +252,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
hipStream_t *stream = (hipStream_t*)malloc(sizeof(hipStream_t) * nStreams);
if (useNullStream) {
if (useNullStream) {
nStreams = 1;
stream[0] = NULL;
} else {
@@ -262,7 +262,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
}
size_t workLeft = N;
size_t workLeft = N;
size_t workPerStream = N / nStreams;
for (int i = 0; i < nStreams; ++i) {
size_t work = (workLeft < workPerStream) ? workLeft : workPerStream;
@@ -287,7 +287,7 @@ void test_chunkedAsyncExample(int nStreams, bool useNullStream, bool useSyncMemc
} else {
HIPCHECK ( hipMemcpyAsync(&C_h[offset], &C_d[offset], workBytes, hipMemcpyDeviceToHost, stream[i]));
}
}
}
HIPCHECK (hipDeviceSynchronize());
+80
ファイルの表示
@@ -0,0 +1,80 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
int *A_h, *B_h, *C_h ;
HIPCHECK(hipGetDeviceCount(&numDevices));
if(numDevices > 1)
{
HIPCHECK(hipSetDevice(0));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoD(X_d, A_d, Nbytes));
HIPCHECK(hipMemcpyDtoD(Y_d, B_d, Nbytes));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
HIPCHECK(hipMemcpyDtoH(C_h, Z_d, Nbytes));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
+84
ファイルの表示
@@ -0,0 +1,84 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
int *A_h, *B_h, *C_h ;
hipStream_t s;
HIPCHECK(hipGetDeviceCount(&numDevices));
if(numDevices > 1)
{
HIPCHECK(hipSetDevice(0));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyDtoDAsync(X_d, A_d, Nbytes, s));
HIPCHECK(hipMemcpyDtoDAsync(Y_d, B_d, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
HIPCHECK(hipMemcpyDtoHAsync(C_h, Z_d, Nbytes, s));
HIPCHECK(hipStreamSynchronize(s));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
+80
ファイルの表示
@@ -0,0 +1,80 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
int *A_h, *B_h, *C_h ;
HIPCHECK(hipGetDeviceCount(&numDevices));
if(numDevices > 1)
{
HIPCHECK(hipSetDevice(0));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK(hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK(hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
HIPCHECK(hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipSetDevice(1));
hipMemcpyPeer(X_d, 1, A_d, 0, Nbytes); //this call is eqv to hipMemcpy(hipMemcpyD2D) which goes via stg bufs.
hipMemcpyPeer(Y_d, 1, B_d, 0, Nbytes);
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
HIPCHECK(hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK(hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
+85
ファイルの表示
@@ -0,0 +1,85 @@
/*
Copyright (c) 2015-2017 Advanced Micro Devices, Inc. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANNTY OF ANY KIND, EXPRESS OR
IMPLIED, INNCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANNY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER INN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR INN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
/*
* Conformance test for checking functionality of
* hipError_t hipMemcpyPeer(void* dst, int dstDeviceId, const void* src, int srcDeviceId, size_t sizeBytes);
*/
/* HIT_START
* BUILD: %t %s ../../test_common.cpp EXCLUDE_HIP_PLATFORM nvcc
* RUN: %t
* HIT_END
*/
#include "test_common.h"
int main()
{
hipDevice_t device;
size_t Nbytes = N*sizeof(int);
int numDevices = 0;
int *A_d, *B_d, *C_d, *X_d, *Y_d, *Z_d;
int *A_h, *B_h, *C_h ;
hipStream_t s;
HIPCHECK(hipGetDeviceCount(&numDevices));
if(numDevices > 1)
{
HIPCHECK(hipSetDevice(0));
unsigned blocks = HipTest::setNumBlocks(blocksPerCU, threadsPerBlock, N);
HipTest::initArrays(&A_d, &B_d, &C_d, &A_h, &B_h, &C_h, N, false);
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMalloc(&X_d,Nbytes));
HIPCHECK(hipMalloc(&Y_d,Nbytes));
HIPCHECK(hipMalloc(&Z_d,Nbytes));
HIPCHECK(hipSetDevice(0));
HIPCHECK ( hipMemcpy(A_d, A_h, Nbytes, hipMemcpyHostToDevice));
HIPCHECK ( hipMemcpy(B_d, B_h, Nbytes, hipMemcpyHostToDevice));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, A_d,B_d, C_d, N);
HIPCHECK ( hipMemcpy(C_h, C_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamCreate(&s));
HIPCHECK(hipSetDevice(1));
HIPCHECK(hipMemcpyPeerAsync(X_d, 1, A_d, 0, Nbytes, s));
HIPCHECK(hipMemcpyPeerAsync(Y_d, 1, B_d, 0, Nbytes, s));
hipLaunchKernel(HipTest::vectorADD, dim3(blocks), dim3(threadsPerBlock), 0, 0, X_d,Y_d, Z_d, N);
HIPCHECK ( hipMemcpy(C_h, Z_d, Nbytes, hipMemcpyDeviceToHost));
HIPCHECK (hipDeviceSynchronize());
HIPCHECK (hipStreamSynchronize(s));
HipTest::checkVectorADD(A_h, B_h, C_h, N);
HIPCHECK(hipStreamDestroy(s));
HipTest::freeArrays(A_d, B_d, C_d, A_h, B_h, C_h, false);
HIPCHECK(hipFree(X_d));
HIPCHECK(hipFree(Y_d));
HIPCHECK(hipFree(Z_d));
}
passed();
}
+1 -1
ファイルの表示
@@ -31,7 +31,7 @@ THE SOFTWARE.
__global__ void Kernel(hipLaunchParm lp,volatile float* hostRes)
{
int tid = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tid = threadIdx.x + blockIdx.x * blockDim.x;
hostRes[tid] = tid + 1;
__threadfence_system();
// expecting that the data is getting flushed to host here!
+29 -18
ファイルの表示
@@ -24,7 +24,7 @@ THE SOFTWARE.
/* HIT_START
* BUILD: %t %s ../../test_common.cpp NVCC_OPTIONS --std=c++11
* RUN: %t EXCLUDE_HIP_PLATFORM all
* RUN: %t EXCLUDE_HIP_PLATFORM all
* HIT_END
*/
@@ -33,11 +33,13 @@ THE SOFTWARE.
#ifdef __HIP_PLATFORM_HCC__
#include <hc_am.hpp>
#define USE_HCC_MEMTRACKER 0
#endif
#define USE_HCC_MEMTRACKER 0 /* Debug flag to show the memtracker periodically */
int elementSizes[] = {16, 1024,524288};
int elementSizes[] = {1, 16, 1024, 524288, 16*1000*1000};
int nSizes = sizeof(elementSizes) / sizeof(int);
int enablePeers(int dev0, int dev1)
@@ -57,26 +59,30 @@ int enablePeers(int dev0, int dev1)
return 0;
};
// Set value of array to specified 32-bit integer:
__global__ void
memsetIntKernel(/*hipLaunchParm lp,*/ int * ptr, const int val, size_t numElements)
memsetIntKernel(int * ptr, const int val, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
ptr[i] = val;
}
};
__global__ void
memcpyIntKernel(/*hipLaunchParm lp, */const int * src, int* dst, size_t numElements)
memcpyIntKernel(const int * src, int* dst, size_t numElements)
{
int gid = (hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x);
int stride = hipBlockDim_x * hipGridDim_x ;
int gid = (blockIdx.x * blockDim.x + threadIdx.x);
int stride = blockDim.x * gridDim.x ;
for (size_t i= gid; i< numElements; i+=stride){
dst[i] = src[i];
}
};
// CHeck arrays in reverse order, to more easily detect cases where
// the copy is "partially" done.
void checkReverse(const int *ptr, int numElements, int expected) {
for (int i=numElements-1; i>=0; i--) {
if (ptr[i] != expected) {
@@ -88,7 +94,8 @@ void checkReverse(const int *ptr, int numElements, int expected) {
printf ("test: OK\n");
}
void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements,
void runTestImpl(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_t gpu1Stream, int numElements,
int * dataGpu0_0, int * dataGpu0_1, int *dataGpu1, int *dataHost, int expected)
{
hipEvent_t e;
@@ -96,7 +103,7 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_
HIPCHECK(hipEventCreateWithFlags(&e,0));
}
const size_t sizeElements = numElements * sizeof(int);
printf ("test: runTest with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled");
printf ("test: runTestImpl with %zu bytes %s with hostSync %s\n", sizeElements, stepAIsCopy ? "copy" : "kernel", hostSync ? "enabled" : "disabled");
hipStream_t stepAStream = gpu0Stream;
@@ -127,9 +134,12 @@ void runTest(bool stepAIsCopy, bool hostSync, hipStream_t gpu0Stream, hipStream_
HIPCHECK(hipStreamSynchronize(gpu0Stream));
checkReverse(dataHost, numElements, expected);
if(!hostSync) {
HIPCHECK(hipEventDestroy(e));
}
}
void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMemcpy)
void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync)
{
const size_t sizeElements = numElements * sizeof(int);
@@ -163,12 +173,15 @@ void testMultiGpu(int dev0, int dev1, int numElements, bool hostSync, bool useMe
#endif
printf (" test: init complete\n");
runTest(useMemcpy , hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected);
runTestImpl(true, hostSync, gpu0Stream, gpu1Stream, numElements, dataGpu0_0,dataGpu0_1, dataGpu1, dataHost, expected);
HIPCHECK(hipFree(dataGpu0_0));
HIPCHECK(hipFree(dataGpu0_1));
HIPCHECK(hipFree(dataGpu1));
HIPCHECK(hipHostFree(dataHost));
HIPCHECK(hipStreamDestroy(gpu0Stream));
HIPCHECK(hipStreamDestroy(gpu1Stream));
};
int main(int argc, char *argv[])
@@ -192,11 +205,9 @@ int main(int argc, char *argv[])
return -1;
};
for(int index = 1;index < nSizes;index++) {
testMultiGpu(dev0, dev1, elementSizes[index] , false /* GPU Synchronization*/, true);
testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, true);
testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/, false);
testMultiGpu(dev0, dev1, elementSizes[index] , false /*Host Synchronization*/, false);
for(int index = 0;index < nSizes;index++) {
testMultiGpu(dev0, dev1, elementSizes[index] , false /*GPU Synchronization*/);
testMultiGpu(dev0, dev1, elementSizes[index] , true /*Host Synchronization*/);
}
+1 -1
ファイルの表示
@@ -34,7 +34,7 @@ THE SOFTWARE.
#define kernel_name "hello_world"
__global__ void Cpy(hipLaunchParm lp, float *Ad, float* Bd){
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
Bd[tx] = Ad[tx];
}
+1 -1
ファイルの表示
@@ -24,7 +24,7 @@ THE SOFTWARE.
extern "C" __global__ void hello_world(hipLaunchParm lp, float *a, float *b)
{
int tx = hipThreadIdx_x;
int tx = threadIdx.x;
b[tx] = a[tx];
}
+2 -2
ファイルの表示
@@ -41,8 +41,8 @@ void printSep()
// Designed to stress a small number of simple smoke tests
template<
typename T=float,
class P=HipTest::Unpinned,
typename T=float,
class P=HipTest::Unpinned,
class C=HipTest::Memcpy
>
void simpleVectorAdd(size_t numElements, int iters, hipStream_t stream)
+3 -3
ファイルの表示
@@ -35,7 +35,7 @@ THE SOFTWARE.
template<typename T>
__global__ void Inc(hipLaunchParm lp, T *Array){
int tx = hipThreadIdx_x + hipBlockIdx_x * hipBlockDim_x;
int tx = threadIdx.x + blockIdx.x * blockDim.x;
Array[tx] = Array[tx] + T(1);
}
@@ -116,7 +116,7 @@ int main(int argc, char **argv)
}
const size_t size = N * sizeof(float);
for (int i=0; i< iterations; i++) {
std::thread t1(run1, size, stream[0]);
@@ -126,7 +126,7 @@ int main(int argc, char **argv)
// std::cout<<"T1"<<std::endl;
t2.join();
// std::cout<<"T2"<<std::endl;
t3.join();
t3.join();
}
passed();
}

変更されたファイルが多すぎるため、一部のファイルは表示されません さらに表示